mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Some progress on PDF multicol support
This commit is contained in:
parent
92f66311c6
commit
8117551bae
@ -15,7 +15,7 @@ pdfreflow, pdfreflow_err = plugins['pdfreflow']
|
|||||||
class PDFInput(InputFormatPlugin):
|
class PDFInput(InputFormatPlugin):
|
||||||
|
|
||||||
name = 'PDF Input'
|
name = 'PDF Input'
|
||||||
author = 'John Schember'
|
author = 'Kovid Goyal and John Schember'
|
||||||
description = 'Convert PDF files to HTML'
|
description = 'Convert PDF files to HTML'
|
||||||
file_types = set(['pdf'])
|
file_types = set(['pdf'])
|
||||||
|
|
||||||
|
@ -18,9 +18,30 @@ class Font(object):
|
|||||||
self.color = spec.get('color')
|
self.color = spec.get('color')
|
||||||
self.family = spec.get('family')
|
self.family = spec.get('family')
|
||||||
|
|
||||||
class Text(object):
|
class Element(object):
|
||||||
|
|
||||||
def __init__(self, text, font_map, opts, log):
|
def __eq__(self, other):
|
||||||
|
return self.id == other.id
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
return hash(self.id)
|
||||||
|
|
||||||
|
class Image(Element):
|
||||||
|
|
||||||
|
def __init__(self, img, opts, log, idc):
|
||||||
|
self.opts, self.log = opts, log
|
||||||
|
self.id = idc.next()
|
||||||
|
self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
|
||||||
|
map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth',
|
||||||
|
'iheight')))
|
||||||
|
self.src = img.get('src')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Text(Element):
|
||||||
|
|
||||||
|
def __init__(self, text, font_map, opts, log, idc):
|
||||||
|
self.id = idc.next()
|
||||||
self.opts, self.log = opts, log
|
self.opts, self.log = opts, log
|
||||||
self.font_map = font_map
|
self.font_map = font_map
|
||||||
self.top, self.left, self.width, self.height = map(float, map(text.get,
|
self.top, self.left, self.width, self.height = map(float, map(text.get,
|
||||||
@ -90,47 +111,6 @@ class Interval(object):
|
|||||||
return hash('(%f,%f)'%self.left, self.right)
|
return hash('(%f,%f)'%self.left, self.right)
|
||||||
|
|
||||||
|
|
||||||
class HorizontalBox(object):
|
|
||||||
|
|
||||||
def __init__(self, base_text):
|
|
||||||
self.texts = [base_text]
|
|
||||||
self.bottom = base_text.bottom
|
|
||||||
self.number_of_columns = None
|
|
||||||
self.column_map = {}
|
|
||||||
|
|
||||||
def append(self, t):
|
|
||||||
self.texts.append(t)
|
|
||||||
|
|
||||||
def sort(self, left_margin, right_margin):
|
|
||||||
self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left))
|
|
||||||
self.top, self.bottom = sys.maxint, 0
|
|
||||||
for t in self.texts:
|
|
||||||
self.top = min(self.top, t.top)
|
|
||||||
self.bottom = max(self.bottom, t.bottom)
|
|
||||||
self.left = self.texts[0].left
|
|
||||||
self.right = self.texts[-1].right
|
|
||||||
self.gaps = []
|
|
||||||
for i, t in enumerate(self.texts[1:]):
|
|
||||||
gap = Interval(self.texts[i].right, t.left)
|
|
||||||
if gap.width > 3:
|
|
||||||
self.gaps.append(gap)
|
|
||||||
left = Interval(left_margin, self.texts[0].left)
|
|
||||||
if left.width > 3:
|
|
||||||
self.gaps.insert(0, left)
|
|
||||||
right = Interval(self.texts[-1].right, right_margin)
|
|
||||||
if right.width > 3:
|
|
||||||
self.gaps.append(right)
|
|
||||||
|
|
||||||
def has_intersection_with(self, gap):
|
|
||||||
for g in self.gaps:
|
|
||||||
if g.intersection(gap):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def identify_columns(self, column_gaps):
|
|
||||||
self.number_of_columns = len(column_gaps) + 1
|
|
||||||
|
|
||||||
|
|
||||||
class Page(object):
|
class Page(object):
|
||||||
|
|
||||||
# Fraction of a character width that two strings have to be apart,
|
# Fraction of a character width that two strings have to be apart,
|
||||||
@ -141,8 +121,10 @@ class Page(object):
|
|||||||
# for them to be considered to be part of the same text fragment
|
# for them to be considered to be part of the same text fragment
|
||||||
LINE_FACTOR = 0.4
|
LINE_FACTOR = 0.4
|
||||||
|
|
||||||
|
YFUZZ = 1.5
|
||||||
|
|
||||||
def __init__(self, page, font_map, opts, log):
|
|
||||||
|
def __init__(self, page, font_map, opts, log, idc):
|
||||||
self.opts, self.log = opts, log
|
self.opts, self.log = opts, log
|
||||||
self.font_map = font_map
|
self.font_map = font_map
|
||||||
self.number = int(page.get('number'))
|
self.number = int(page.get('number'))
|
||||||
@ -154,7 +136,7 @@ class Page(object):
|
|||||||
self.left_margin, self.right_margin = self.width, 0
|
self.left_margin, self.right_margin = self.width, 0
|
||||||
|
|
||||||
for text in page.xpath('descendant::text'):
|
for text in page.xpath('descendant::text'):
|
||||||
self.texts.append(Text(text, self.font_map, self.opts, self.log))
|
self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
|
||||||
text = self.texts[-1]
|
text = self.texts[-1]
|
||||||
self.left_margin = min(text.left, self.left_margin)
|
self.left_margin = min(text.left, self.left_margin)
|
||||||
self.right_margin = max(text.right, self.right_margin)
|
self.right_margin = max(text.right, self.right_margin)
|
||||||
@ -162,16 +144,22 @@ class Page(object):
|
|||||||
self.textwidth = self.right_margin - self.left_margin
|
self.textwidth = self.right_margin - self.left_margin
|
||||||
|
|
||||||
self.font_size_stats = {}
|
self.font_size_stats = {}
|
||||||
|
self.average_text_height = 0
|
||||||
for t in self.texts:
|
for t in self.texts:
|
||||||
if t.font_size not in self.font_size_stats:
|
if t.font_size not in self.font_size_stats:
|
||||||
self.font_size_stats[t.font_size] = 0
|
self.font_size_stats[t.font_size] = 0
|
||||||
self.font_size_stats[t.font_size] += len(t.text_as_string)
|
self.font_size_stats[t.font_size] += len(t.text_as_string)
|
||||||
|
self.average_text_height += t.height
|
||||||
|
self.average_text_height /= len(self.texts)
|
||||||
|
|
||||||
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
||||||
|
|
||||||
self.coalesce_fragments()
|
self.coalesce_fragments()
|
||||||
|
|
||||||
#self.identify_columns()
|
self.elements = list(self.texts)
|
||||||
|
for img in page.xpath('descendant::img'):
|
||||||
|
self.elements.append(Image(img, self.opts, self.log, idc))
|
||||||
|
self.elements.sort(cmp=lambda x,y:cmp(x.top, y.top))
|
||||||
|
|
||||||
def coalesce_fragments(self):
|
def coalesce_fragments(self):
|
||||||
|
|
||||||
@ -196,46 +184,19 @@ class Page(object):
|
|||||||
if match is not None:
|
if match is not None:
|
||||||
self.texts.remove(match)
|
self.texts.remove(match)
|
||||||
|
|
||||||
def sort_into_horizontal_boxes(self, document_font_size_stats):
|
def first_pass(self):
|
||||||
self.horizontal_boxes = []
|
self.regions = []
|
||||||
|
if not self.elements:
|
||||||
def find_closest_match(text):
|
return
|
||||||
'Return horizontal box whose bottom is closest to text or None'
|
for i, x in enumerate(self.elements):
|
||||||
min, ans = 3.1, None
|
x.idx = i
|
||||||
for hb in self.horizontal_boxes:
|
self.current_region = None
|
||||||
diff = abs(text.bottom - hb.bottom)
|
for x in self.elements:
|
||||||
if diff < min:
|
self.find_elements_in_row_of(x)
|
||||||
diff, ans = min, hb
|
|
||||||
return ans
|
|
||||||
|
|
||||||
for t in self.texts:
|
|
||||||
hb = find_closest_match(t)
|
|
||||||
if hb is None:
|
|
||||||
self.horizontal_boxes.append(HorizontalBox(t))
|
|
||||||
else:
|
|
||||||
hb.append(t)
|
|
||||||
|
|
||||||
|
|
||||||
for hb in self.horizontal_boxes:
|
|
||||||
hb.sort(self.left_margin, self.right_margin)
|
|
||||||
|
|
||||||
self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom))
|
|
||||||
|
|
||||||
def identify_columns(self):
|
|
||||||
|
|
||||||
def neighborhood(i):
|
|
||||||
if i == len(self.horizontal_boxes)-1:
|
|
||||||
return self.horizontal_boxes[i-2:i]
|
|
||||||
if i == len(self.horizontal_boxes)-2:
|
|
||||||
return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1])
|
|
||||||
return self.horizontal_boxes[i+1], self.horizontal_boxes[i+2]
|
|
||||||
|
|
||||||
for i, hbox in enumerate(self.horizontal_boxes):
|
|
||||||
n1, n2 = neighborhood(i)
|
|
||||||
for gap in hbox.gaps:
|
|
||||||
gap.is_column_gap = n1.has_intersection_with(gap) and \
|
|
||||||
n2.has_intersection_with(gap)
|
|
||||||
|
|
||||||
|
def find_elements_in_row_of(self, x):
|
||||||
|
interval = Interval(x.top - self.YFUZZ * self.average_text_height,
|
||||||
|
x.top + self.YFUZZ*(1+self.average_text_height))
|
||||||
|
|
||||||
|
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
@ -244,6 +205,7 @@ class PDFDocument(object):
|
|||||||
self.opts, self.log = opts, log
|
self.opts, self.log = opts, log
|
||||||
parser = etree.XMLParser(recover=True)
|
parser = etree.XMLParser(recover=True)
|
||||||
self.root = etree.fromstring(xml, parser=parser)
|
self.root = etree.fromstring(xml, parser=parser)
|
||||||
|
idc = iter(xrange(sys.maxint))
|
||||||
|
|
||||||
self.fonts = []
|
self.fonts = []
|
||||||
self.font_map = {}
|
self.font_map = {}
|
||||||
@ -256,14 +218,15 @@ class PDFDocument(object):
|
|||||||
self.page_map = {}
|
self.page_map = {}
|
||||||
|
|
||||||
for page in self.root.xpath('//page'):
|
for page in self.root.xpath('//page'):
|
||||||
page = Page(page, self.font_map, opts, log)
|
page = Page(page, self.font_map, opts, log, idc)
|
||||||
self.page_map[page.id] = page
|
self.page_map[page.id] = page
|
||||||
self.pages.append(page)
|
self.pages.append(page)
|
||||||
|
|
||||||
self.collect_font_statistics()
|
self.collect_font_statistics()
|
||||||
|
|
||||||
for page in self.pages:
|
for page in self.pages:
|
||||||
page.sort_into_horizontal_boxes(self.font_size_stats)
|
page.document_font_stats = self.font_size_stats
|
||||||
|
page.first_pass()
|
||||||
|
|
||||||
def collect_font_statistics(self):
|
def collect_font_statistics(self):
|
||||||
self.font_size_stats = {}
|
self.font_size_stats = {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user