mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: If <body> tag is not under <html> move it to the correct place. LIT Input: Strip embedded <metadata> and <guide> elements. Fixes #4712 (Unable to convert .rtf and .lit files to .EPUB)
This commit is contained in:
parent
68beb72bbd
commit
5e93ea1da2
@ -26,6 +26,11 @@ class LITInput(InputFormatPlugin):
|
|||||||
for item in oeb.spine:
|
for item in oeb.spine:
|
||||||
root = item.data
|
root = item.data
|
||||||
if not hasattr(root, 'xpath'): continue
|
if not hasattr(root, 'xpath'): continue
|
||||||
|
for bad in ('metadata', 'guide'):
|
||||||
|
metadata = XPath('//h:'+bad)(root)
|
||||||
|
if metadata:
|
||||||
|
for x in metadata:
|
||||||
|
x.getparent().remove(x)
|
||||||
body = XPath('//h:body')(root)
|
body = XPath('//h:body')(root)
|
||||||
if body:
|
if body:
|
||||||
body = body[0]
|
body = body[0]
|
||||||
|
@ -909,6 +909,12 @@ class Manifest(object):
|
|||||||
'content': '%s; charset=utf-8' % XHTML_NS})
|
'content': '%s; charset=utf-8' % XHTML_NS})
|
||||||
# Ensure has a <body/>
|
# Ensure has a <body/>
|
||||||
if not xpath(data, '/h:html/h:body'):
|
if not xpath(data, '/h:html/h:body'):
|
||||||
|
body = xpath(data, '//h:body')
|
||||||
|
if body:
|
||||||
|
body = body[0]
|
||||||
|
body.getparent().remove(body)
|
||||||
|
data.append(body)
|
||||||
|
else:
|
||||||
self.oeb.logger.warn(
|
self.oeb.logger.warn(
|
||||||
'File %r missing <body/> element' % self.href)
|
'File %r missing <body/> element' % self.href)
|
||||||
etree.SubElement(data, XHTML('body'))
|
etree.SubElement(data, XHTML('body'))
|
||||||
|
@ -43,6 +43,10 @@ class Image(Element):
|
|||||||
self.bottom = self.top + self.height
|
self.bottom = self.top + self.height
|
||||||
self.right = self.left + self.width
|
self.right = self.left + self.width
|
||||||
|
|
||||||
|
def to_html(self):
    """Return the ``<img/>`` markup for this image.

    The ``src`` attribute is emitted as-is; width and height are truncated
    to whole numbers and suffixed with ``px``.
    """
    template = '<img src="%s" width="%dpx" height="%dpx"/>'
    values = (self.src, int(self.width), int(self.height))
    return template % values
|
||||||
|
|
||||||
|
|
||||||
class Text(Element):
|
class Text(Element):
|
||||||
|
|
||||||
@ -66,8 +70,6 @@ class Text(Element):
|
|||||||
self.raw = text.text if text.text else u''
|
self.raw = text.text if text.text else u''
|
||||||
for x in text.iterchildren():
|
for x in text.iterchildren():
|
||||||
self.raw += etree.tostring(x, method='xml', encoding=unicode)
|
self.raw += etree.tostring(x, method='xml', encoding=unicode)
|
||||||
if x.tail:
|
|
||||||
self.raw += x.tail
|
|
||||||
self.average_character_width = self.width/len(self.text_as_string)
|
self.average_character_width = self.width/len(self.text_as_string)
|
||||||
|
|
||||||
def coalesce(self, other, page_number):
|
def coalesce(self, other, page_number):
|
||||||
@ -86,6 +88,9 @@ class Text(Element):
|
|||||||
self.average_character_width = (self.average_character_width +
|
self.average_character_width = (self.average_character_width +
|
||||||
other.average_character_width)/2.0
|
other.average_character_width)/2.0
|
||||||
|
|
||||||
|
def to_html(self):
    """Return the markup accumulated in ``self.raw`` for this text element."""
    markup = self.raw
    return markup
|
||||||
|
|
||||||
class FontSizeStats(dict):
|
class FontSizeStats(dict):
|
||||||
|
|
||||||
def __init__(self, stats):
|
def __init__(self, stats):
|
||||||
@ -108,6 +113,11 @@ class Interval(object):
|
|||||||
right = min(self.right, other.right)
|
right = min(self.right, other.right)
|
||||||
return Interval(left, right)
|
return Interval(left, right)
|
||||||
|
|
||||||
|
def centered_in(self, parent):
    """Report whether this interval is roughly centered inside ``parent``.

    The distance between the two left edges is compared with the distance
    between the two right edges; the interval counts as centered when those
    two gaps differ by less than 3 units.
    """
    left_gap = abs(self.left - parent.left)
    right_gap = abs(self.right - parent.right)
    imbalance = left_gap - right_gap
    return -3 < imbalance < 3
|
||||||
|
|
||||||
def __nonzero__(self):
|
def __nonzero__(self):
|
||||||
return self.width > 0
|
return self.width > 0
|
||||||
|
|
||||||
@ -146,6 +156,9 @@ class Column(object):
|
|||||||
for x in self.elements:
|
for x in self.elements:
|
||||||
yield x
|
yield x
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.elements)
|
||||||
|
|
||||||
def contains(self, elem):
|
def contains(self, elem):
|
||||||
return elem.left > self.left - self.HFUZZ*self.width and \
|
return elem.left > self.left - self.HFUZZ*self.width and \
|
||||||
elem.right < self.right + self.HFUZZ*self.width
|
elem.right < self.right + self.HFUZZ*self.width
|
||||||
@ -174,17 +187,42 @@ class Column(object):
|
|||||||
class Box(list):
    """A list of items that is rendered inside a single HTML tag.

    Items are either ``int`` values, rendered as named page anchors, or
    objects exposing a ``to_html()`` method that returns a string.
    """

    def __init__(self, type='p'):
        # ``type`` is the tag name used when rendering (``p`` by default).
        self.tag = type

    def to_html(self):
        """Return the rendered box as a list of markup fragments."""
        fragments = ['<%s>' % self.tag]
        for item in self:
            if not isinstance(item, int):
                fragments.append(item.to_html() + ' ')
                continue
            fragments.append('<a name="page_%d"/>' % item)
        fragments.append('</%s>' % self.tag)
        return fragments
|
||||||
|
|
||||||
class ImageBox(Box):
    """A box that renders an image, optionally followed by further items.

    The image is kept in ``self.img``; any list items (page numbers or
    objects with ``to_html()``) are rendered after it, separated from the
    image by a line break.
    """

    def __init__(self, img):
        Box.__init__(self)
        self.img = img

    def to_html(self):
        """Return the centered image block as a list of markup fragments."""
        fragments = ['<div style="text-align:center">', self.img.to_html()]
        if len(self) > 0:
            fragments.append('<br/>')
            for item in self:
                if not isinstance(item, int):
                    fragments.append(item.to_html() + ' ')
                    continue
                fragments.append('<a name="page_%d"/>' % item)
        fragments.append('</div>')
        return fragments
|
||||||
|
|
||||||
|
|
||||||
class Region(object):
|
class Region(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, opts, log):
|
||||||
|
self.opts, self.log = opts, log
|
||||||
self.columns = []
|
self.columns = []
|
||||||
self.top = self.bottom = self.left = self.right = self.width = self.height = 0
|
self.top = self.bottom = self.left = self.right = self.width = self.height = 0
|
||||||
|
|
||||||
@ -217,6 +255,40 @@ class Region(object):
|
|||||||
def is_empty(self):
|
def is_empty(self):
|
||||||
return len(self.columns) == 0
|
return len(self.columns) == 0
|
||||||
|
|
||||||
|
@property
def is_small(self):
    """True when no column of this region holds more than two elements.

    The tallest column decides: a region whose longest column has fewer
    than three elements is considered small. A region without any columns
    is small as well.
    """
    # ``or [0]`` keeps max() from failing on a region without columns.
    max_lines = max([len(c) for c in self.columns] or [0])
    # The previous ``max_lines > 2`` was inverted: it flagged tall regions
    # as small and never flagged the genuinely small ones.
    return max_lines < 3
|
||||||
|
|
||||||
|
def absorb(self, singleton):
    """Move every element of ``singleton`` into a column of this region.

    Each element found in ``singleton.columns`` is added to the column of
    this region whose horizontal extent overlaps the element the most. When
    no column overlaps at all, a warning is logged and the first column is
    used instead.
    """

    def most_suitable_column(elem):
        # Pick the column with the widest horizontal overlap with ``elem``.
        mc, mw = None, 0
        for c in self.columns:
            i = Interval(c.left, c.right)
            e = Interval(elem.left, elem.right)
            w = i.intersection(e).width
            if w > mw:
                mc, mw = c, w
        if mc is None:
            self.log.warn('No suitable column for singleton',
                    elem.to_html())
            mc = self.columns[0]
        return mc

    # NOTE: a stray bare ``print`` statement used to sit here and wrote a
    # blank line to stdout on every call; it was debugging residue.
    for c in singleton.columns:
        for elem in c:
            col = most_suitable_column(elem)
            if self.opts.verbose > 3:
                idx = self.columns.index(col)
                self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(),
                        idx)
            col.add(elem)
|
||||||
|
|
||||||
|
|
||||||
def collect_stats(self):
|
def collect_stats(self):
|
||||||
for column in self.columns:
|
for column in self.columns:
|
||||||
column.collect_stats()
|
column.collect_stats()
|
||||||
@ -231,7 +303,6 @@ class Region(object):
|
|||||||
self.elements = []
|
self.elements = []
|
||||||
for x in self.columns:
|
for x in self.columns:
|
||||||
self.elements.extend(x)
|
self.elements.extend(x)
|
||||||
|
|
||||||
self.boxes = [Box()]
|
self.boxes = [Box()]
|
||||||
for i, elem in enumerate(self.elements):
|
for i, elem in enumerate(self.elements):
|
||||||
if isinstance(elem, Image):
|
if isinstance(elem, Image):
|
||||||
@ -341,7 +412,7 @@ class Page(object):
|
|||||||
return
|
return
|
||||||
for i, x in enumerate(self.elements):
|
for i, x in enumerate(self.elements):
|
||||||
x.idx = i
|
x.idx = i
|
||||||
current_region = Region()
|
current_region = Region(self.opts, self.log)
|
||||||
processed = set([])
|
processed = set([])
|
||||||
for x in self.elements:
|
for x in self.elements:
|
||||||
if x in processed: continue
|
if x in processed: continue
|
||||||
@ -350,12 +421,42 @@ class Page(object):
|
|||||||
processed.update(elems)
|
processed.update(elems)
|
||||||
if not current_region.contains(columns):
|
if not current_region.contains(columns):
|
||||||
self.regions.append(current_region)
|
self.regions.append(current_region)
|
||||||
current_region = Region()
|
current_region = Region(self.opts, self.log)
|
||||||
current_region.add(columns)
|
current_region.add(columns)
|
||||||
if not current_region.is_empty:
|
if not current_region.is_empty:
|
||||||
self.regions.append(current_region)
|
self.regions.append(current_region)
|
||||||
|
|
||||||
|
self.coalesce_regions()
|
||||||
|
|
||||||
|
def coalesce_regions(self):
    """Locate runs of contiguous small regions that are candidates for merging.

    Intended algorithm (from the original notes):

    * find contiguous sets of small regions
    * absorb them into a neighboring region (prefer the one with number of
      cols closer to the avg number of cols in the set, if equal use the
      larger region)
    * merge contiguous regions that can contain each other

    Only the first step is performed. The earlier ``while found`` loop never
    changed ``self.regions`` and therefore never terminated once a single
    small region existed; it also read the inner loop variable after the
    loop, which is unbound when the small region is the last one. This
    version makes one bounded pass and leaves ``self.regions`` untouched.
    """
    total = len(self.regions)
    start = 0
    while start < total:
        if not self.regions[start].is_small:
            start += 1
            continue
        # Extend the run while the following regions are small too.
        end = start + 1
        while end < total and self.regions[end].is_small:
            end += 1
        # Neighbors a run could be absorbed into (None at either page edge).
        prev_idx = start - 1 if start > 0 else None
        next_idx = end if end < total else None
        if self.opts.verbose > 3:
            self.log.debug('Small regions %d-%d, neighbors:' % (start, end - 1),
                    prev_idx, next_idx)
        # TODO: absorb self.regions[start:end] into the chosen neighbor.
        start = end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def sort_into_columns(self, elem, neighbors):
|
def sort_into_columns(self, elem, neighbors):
|
||||||
|
neighbors.add(elem)
|
||||||
|
neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left))
|
||||||
|
if self.opts.verbose > 3:
|
||||||
|
self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
|
||||||
columns = [Column()]
|
columns = [Column()]
|
||||||
columns[0].add(elem)
|
columns[0].add(elem)
|
||||||
for x in neighbors:
|
for x in neighbors:
|
||||||
@ -421,6 +522,9 @@ class PDFDocument(object):
|
|||||||
page.first_pass()
|
page.first_pass()
|
||||||
page.second_pass()
|
page.second_pass()
|
||||||
|
|
||||||
|
self.linearize()
|
||||||
|
self.render()
|
||||||
|
|
||||||
def collect_font_statistics(self):
|
def collect_font_statistics(self):
|
||||||
self.font_size_stats = {}
|
self.font_size_stats = {}
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
@ -432,5 +536,43 @@ class PDFDocument(object):
|
|||||||
|
|
||||||
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
||||||
|
|
||||||
|
def linearize(self):
    """Flatten the boxes of all pages into ``self.elements``.

    Regions are visited in page order. When a region has the same number of
    columns as the previous region, its first box is merged into the last
    emitted box instead of starting a new one, provided neither box carries
    an image. The page number is inserted once per page, in front of the
    first content of that page.
    """
    self.elements = []
    last_region = last_block = None
    for page in self.pages:
        page_number_inserted = False
        for region in page.regions:
            # ``last_block`` can still be None if earlier regions had no
            # boxes; there is nothing to merge into in that case.
            merge_first_block = (
                last_region is not None and last_block is not None and
                len(last_region.columns) == len(region.columns) and
                not hasattr(last_block, 'img'))
            for block in region.boxes:
                # A block carrying an image keeps it in an attribute, not
                # in its list items, so merging it would drop the image.
                do_merge = merge_first_block and not hasattr(block, 'img')
                merge_first_block = False
                if do_merge:
                    if not page_number_inserted:
                        last_block.append(page.number)
                        page_number_inserted = True
                    for elem in block:
                        last_block.append(elem)
                    # ``last_block`` deliberately keeps pointing at the box
                    # that lives in self.elements; rebinding it to the
                    # merged-away block would make a later merge write into
                    # an orphan and lose its content.
                else:
                    if not page_number_inserted:
                        block.insert(0, page.number)
                        page_number_inserted = True
                    self.elements.append(block)
                    last_block = block
            last_region = region
|
||||||
|
|
||||||
|
|
||||||
|
def render(self):
    """Write the linearized document to ``index.html`` in the current directory.

    Every item of ``self.elements`` contributes the list of markup fragments
    returned by its ``to_html()``. The fragments are joined with newlines
    and written UTF-8 encoded.
    """
    html = ['<?xml version="1.0" encoding="UTF-8"?>',
            '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
            '<title>PDF Reflow conversion</title>', '</head>', '<body>',
            '<div>']
    for elem in self.elements:
        html.extend(elem.to_html())
    # The wrapping <div> was previously left open, which made the output
    # malformed XML.
    html += ['</div>', '</body>', '</html>']
    with open('index.html', 'wb') as f:
        f.write((u'\n'.join(html)).encode('utf-8'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -195,9 +195,9 @@ class RTFInput(InputFormatPlugin):
|
|||||||
fname = self.preprocess(stream.name)
|
fname = self.preprocess(stream.name)
|
||||||
try:
|
try:
|
||||||
xml = self.generate_xml(fname)
|
xml = self.generate_xml(fname)
|
||||||
except RtfInvalidCodeException:
|
except RtfInvalidCodeException, e:
|
||||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||||
'support. Convert it to HTML first and then try it.'))
|
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||||
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
||||||
if d:
|
if d:
|
||||||
imap = {}
|
imap = {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user