From 5e93ea1da26da5594c0a5a5853d6023fb220ea88 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 28 Jan 2010 09:43:14 -0700 Subject: [PATCH] Conversion pipeline: If <body> tag is not under <html> move it to the correct place. LIT Input: Strip embedded <metadata> and <guide> elements. Fixes #4712 (Unable to convert .rtf and .lit files to .EPUB) --- src/calibre/ebooks/lit/input.py | 5 + src/calibre/ebooks/oeb/base.py | 12 ++- src/calibre/ebooks/pdf/reflow.py | 158 +++++++++++++++++++++++++++++-- src/calibre/ebooks/rtf/input.py | 4 +- 4 files changed, 166 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 8655d8b189..89873196c9 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -26,6 +26,11 @@ class LITInput(InputFormatPlugin): for item in oeb.spine: root = item.data if not hasattr(root, 'xpath'): continue + for bad in ('metadata', 'guide'): + metadata = XPath('//h:'+bad)(root) + if metadata: + for x in metadata: + x.getparent().remove(x) body = XPath('//h:body')(root) if body: body = body[0] diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 18d3de1e56..c93a0689b2 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -909,9 +909,15 @@ class Manifest(object): 'content': '%s; charset=utf-8' % XHTML_NS}) # Ensure has a if not xpath(data, '/h:html/h:body'): - self.oeb.logger.warn( - 'File %r missing element' % self.href) - etree.SubElement(data, XHTML('body')) + body = xpath(data, '//h:body') + if body: + body = body[0] + body.getparent().remove(body) + data.append(body) + else: + self.oeb.logger.warn( + 'File %r missing element' % self.href) + etree.SubElement(data, XHTML('body')) # Remove microsoft office markup r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag] diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 80cfc0bb30..bf2d921a10 100644 --- 
a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -43,6 +43,10 @@ class Image(Element): self.bottom = self.top + self.height self.right = self.left + self.width + def to_html(self): + return '' % \ + (self.src, int(self.width), int(self.height)) + class Text(Element): @@ -66,8 +70,6 @@ class Text(Element): self.raw = text.text if text.text else u'' for x in text.iterchildren(): self.raw += etree.tostring(x, method='xml', encoding=unicode) - if x.tail: - self.raw += x.tail self.average_character_width = self.width/len(self.text_as_string) def coalesce(self, other, page_number): @@ -86,6 +88,9 @@ class Text(Element): self.average_character_width = (self.average_character_width + other.average_character_width)/2.0 + def to_html(self): + return self.raw + class FontSizeStats(dict): def __init__(self, stats): @@ -108,6 +113,11 @@ class Interval(object): right = min(self.right, other.right) return Interval(left, right) + def centered_in(self, parent): + left = abs(self.left - parent.left) + right = abs(self.right - parent.right) + return abs(left-right) < 3 + def __nonzero__(self): return self.width > 0 @@ -146,6 +156,9 @@ class Column(object): for x in self.elements: yield x + def __len__(self): + return len(self.elements) + def contains(self, elem): return elem.left > self.left - self.HFUZZ*self.width and \ elem.right < self.right + self.HFUZZ*self.width @@ -174,17 +187,42 @@ class Column(object): class Box(list): def __init__(self, type='p'): - self.type = type + self.tag = type + + def to_html(self): + ans = ['<%s>'%self.tag] + for elem in self: + if isinstance(elem, int): + ans.append(''%elem) + else: + ans.append(elem.to_html()+' ') + ans.append(''%self.tag) + return ans class ImageBox(Box): def __init__(self, img): - Box.__init__(self, type='img') + Box.__init__(self) self.img = img + def to_html(self): + ans = ['
'] + ans.append(self.img.to_html()) + if len(self) > 0: + ans.append('
') + for elem in self: + if isinstance(elem, int): + ans.append('
'%elem) + else: + ans.append(elem.to_html()+' ') + ans.append('
') + return ans + + class Region(object): - def __init__(self): + def __init__(self, opts, log): + self.opts, self.log = opts, log self.columns = [] self.top = self.bottom = self.left = self.right = self.width = self.height = 0 @@ -217,6 +255,40 @@ class Region(object): def is_empty(self): return len(self.columns) == 0 + @property + def is_small(self): + max_lines = 0 + for c in self.columns: + max_lines = max(max_lines, len(c)) + return max_lines > 2 + + def absorb(self, singleton): + + def most_suitable_column(elem): + mc, mw = None, 0 + for c in self.columns: + i = Interval(c.left, c.right) + e = Interval(elem.left, elem.right) + w = i.intersection(e).width + if w > mw: + mc, mw = c, w + if mc is None: + self.log.warn('No suitable column for singleton', + elem.to_html()) + mc = self.columns[0] + return mc + + print + for c in singleton.columns: + for elem in c: + col = most_suitable_column(elem) + if self.opts.verbose > 3: + idx = self.columns.index(col) + self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(), + idx) + col.add(elem) + + def collect_stats(self): for column in self.columns: column.collect_stats() @@ -231,7 +303,6 @@ class Region(object): self.elements = [] for x in self.columns: self.elements.extend(x) - self.boxes = [Box()] for i, elem in enumerate(self.elements): if isinstance(elem, Image): @@ -341,7 +412,7 @@ class Page(object): return for i, x in enumerate(self.elements): x.idx = i - current_region = Region() + current_region = Region(self.opts, self.log) processed = set([]) for x in self.elements: if x in processed: continue @@ -350,12 +421,42 @@ class Page(object): processed.update(elems) if not current_region.contains(columns): self.regions.append(current_region) - current_region = Region() + current_region = Region(self.opts, self.log) current_region.add(columns) if not current_region.is_empty: self.regions.append(current_region) + self.coalesce_regions() + + def coalesce_regions(self): + # find contiguous sets of small 
regions + # absorb into a neighboring region (prefer the one with number of cols + # closer to the avg number of cols in the set, if equal use large + # region) + # merge contiguous regions that can contain each other + absorbed = set([]) + found = True + while found: + found = False + for i, region in enumerate(self.regions): + if region.is_small: + found = True + regions = [] + for j in range(i+1, len(self.regions)): + if self.regions[j].is_small: + regions.append(self.regions[j]) + else: + break + prev = None if i == 0 else i-1 + next = j if self.regions[j] not in regions else None + + + def sort_into_columns(self, elem, neighbors): + neighbors.add(elem) + neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left)) + if self.opts.verbose > 3: + self.log.debug('Neighbors:', [x.to_html() for x in neighbors]) columns = [Column()] columns[0].add(elem) for x in neighbors: @@ -421,6 +522,9 @@ class PDFDocument(object): page.first_pass() page.second_pass() + self.linearize() + self.render() + def collect_font_statistics(self): self.font_size_stats = {} for p in self.pages: @@ -432,5 +536,43 @@ class PDFDocument(object): self.font_size_stats = FontSizeStats(self.font_size_stats) + def linearize(self): + self.elements = [] + last_region = last_block = None + for page in self.pages: + page_number_inserted = False + for region in page.regions: + merge_first_block = last_region is not None and \ + len(last_region.columns) == len(region.columns) and \ + not hasattr(last_block, 'img') + for i, block in enumerate(region.boxes): + if merge_first_block: + merge_first_block = False + if not page_number_inserted: + last_block.append(page.number) + page_number_inserted = True + for elem in block: + last_block.append(elem) + else: + if not page_number_inserted: + block.insert(0, page.number) + page_number_inserted = True + self.elements.append(block) + last_block = block + last_region = region + + + def render(self): + html = ['', + '', '', + 'PDF Reflow conversion', '', '', + 
'
'] + for elem in self.elements: + html.extend(elem.to_html()) + html += ['', ''] + with open('index.html', 'wb') as f: + f.write((u'\n'.join(html)).encode('utf-8')) + + diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index ff20793f39..d5e1a95157 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -195,9 +195,9 @@ class RTFInput(InputFormatPlugin): fname = self.preprocess(stream.name) try: xml = self.generate_xml(fname) - except RtfInvalidCodeException: + except RtfInvalidCodeException, e: raise ValueError(_('This RTF file has a feature calibre does not ' - 'support. Convert it to HTML first and then try it.')) + 'support. Convert it to HTML first and then try it.\n%s')%e) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {}