Some progress on PDF multicol support

This commit is contained in:
Kovid Goyal 2010-01-07 16:43:50 -07:00
parent 92f66311c6
commit 8117551bae
2 changed files with 51 additions and 88 deletions

View File

@ -15,7 +15,7 @@ pdfreflow, pdfreflow_err = plugins['pdfreflow']
class PDFInput(InputFormatPlugin):
name = 'PDF Input'
author = 'John Schember'
author = 'Kovid Goyal and John Schember'
description = 'Convert PDF files to HTML'
file_types = set(['pdf'])

View File

@ -18,9 +18,30 @@ class Font(object):
self.color = spec.get('color')
self.family = spec.get('family')
class Text(object):
class Element(object):
def __init__(self, text, font_map, opts, log):
def __eq__(self, other):
return self.id == other.id
def __hash__(self):
return hash(self.id)
class Image(Element):
    """An image on a page.

    Position and size are read from the ``top``, ``left``, ``rwidth``,
    ``rheight``, ``iwidth`` and ``iheight`` attributes of the ``img`` node
    and stored as floats; ``src`` is stored unchanged.
    """

    def __init__(self, img, opts, log, idc):
        self.opts = opts
        self.log = log
        # idc is the shared id counter; every element draws its id from it.
        self.id = idc.next()
        geometry = [float(img.get(attr)) for attr in
                    ('top', 'left', 'rwidth', 'rheight', 'iwidth', 'iheight')]
        (self.top, self.left, self.width, self.height,
         self.iwidth, self.iheight) = geometry
        self.src = img.get('src')
class Text(Element):
def __init__(self, text, font_map, opts, log, idc):
self.id = idc.next()
self.opts, self.log = opts, log
self.font_map = font_map
self.top, self.left, self.width, self.height = map(float, map(text.get,
@ -90,47 +111,6 @@ class Interval(object):
return hash('(%f,%f)'%self.left, self.right)
class HorizontalBox(object):
    """A group of text fragments collected into one horizontal row."""

    def __init__(self, base_text):
        self.texts = [base_text]
        self.bottom = base_text.bottom
        self.number_of_columns = None
        self.column_map = {}

    def append(self, t):
        """Add another text fragment to this row."""
        self.texts.append(t)

    def sort(self, left_margin, right_margin):
        """Order the fragments left to right and recompute the row's extent
        and the list of gaps wider than 3 units.

        The gaps between consecutive fragments are collected first; the gap
        from ``left_margin`` to the first fragment is then placed in front
        and the gap from the last fragment to ``right_margin`` at the end.
        """
        self.texts.sort(key=lambda t: t.left)

        # The seeds (sys.maxint and 0) mirror the starting values of the
        # running min/max, so they also bound the results.
        self.top = min([sys.maxint] + [t.top for t in self.texts])
        self.bottom = max([0] + [t.bottom for t in self.texts])

        first, last = self.texts[0], self.texts[-1]
        self.left = first.left
        self.right = last.right

        self.gaps = []
        for prev, cur in zip(self.texts, self.texts[1:]):
            between = Interval(prev.right, cur.left)
            if between.width > 3:
                self.gaps.append(between)

        leading = Interval(left_margin, first.left)
        if leading.width > 3:
            self.gaps.insert(0, leading)

        trailing = Interval(last.right, right_margin)
        if trailing.width > 3:
            self.gaps.append(trailing)

    def has_intersection_with(self, gap):
        """Return True if any gap of this row intersects ``gap``."""
        return any(g.intersection(gap) for g in self.gaps)

    def identify_columns(self, column_gaps):
        """Record the column count implied by ``column_gaps``."""
        self.number_of_columns = len(column_gaps) + 1
class Page(object):
# Fraction of a character width that two strings have to be apart,
@ -141,8 +121,10 @@ class Page(object):
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
YFUZZ = 1.5
def __init__(self, page, font_map, opts, log):
def __init__(self, page, font_map, opts, log, idc):
self.opts, self.log = opts, log
self.font_map = font_map
self.number = int(page.get('number'))
@ -154,7 +136,7 @@ class Page(object):
self.left_margin, self.right_margin = self.width, 0
for text in page.xpath('descendant::text'):
self.texts.append(Text(text, self.font_map, self.opts, self.log))
self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
text = self.texts[-1]
self.left_margin = min(text.left, self.left_margin)
self.right_margin = max(text.right, self.right_margin)
@ -162,16 +144,22 @@ class Page(object):
self.textwidth = self.right_margin - self.left_margin
self.font_size_stats = {}
self.average_text_height = 0
for t in self.texts:
if t.font_size not in self.font_size_stats:
self.font_size_stats[t.font_size] = 0
self.font_size_stats[t.font_size] += len(t.text_as_string)
self.average_text_height += t.height
self.average_text_height /= len(self.texts)
self.font_size_stats = FontSizeStats(self.font_size_stats)
self.coalesce_fragments()
#self.identify_columns()
self.elements = list(self.texts)
for img in page.xpath('descendant::img'):
self.elements.append(Image(img, self.opts, self.log, idc))
self.elements.sort(cmp=lambda x,y:cmp(x.top, y.top))
def coalesce_fragments(self):
@ -196,46 +184,19 @@ class Page(object):
if match is not None:
self.texts.remove(match)
def sort_into_horizontal_boxes(self, document_font_size_stats):
    """Group ``self.texts`` into horizontal boxes by their bottom edge.

    Each text joins the existing box whose ``bottom`` is closest to its own
    bottom, provided the distance is below 3.1; otherwise it starts a new
    box. Every box is then sorted against the page margins and the boxes
    are ordered top to bottom by ``bottom``.

    ``document_font_size_stats`` is accepted for the caller's benefit but is
    not used here.
    """
    self.horizontal_boxes = []

    def find_closest_match(text):
        'Return horizontal box whose bottom is closest to text or None'
        best_diff, ans = 3.1, None
        for hb in self.horizontal_boxes:
            diff = abs(text.bottom - hb.bottom)
            if diff < best_diff:
                # Tighten the threshold so that only a strictly closer box
                # can replace the current candidate.
                best_diff, ans = diff, hb
        return ans

    for t in self.texts:
        hb = find_closest_match(t)
        if hb is None:
            self.horizontal_boxes.append(HorizontalBox(t))
        else:
            hb.append(t)

    for hb in self.horizontal_boxes:
        hb.sort(self.left_margin, self.right_margin)

    self.horizontal_boxes.sort(key=lambda box: box.bottom)
def identify_columns(self):
def neighborhood(i):
if i == len(self.horizontal_boxes)-1:
return self.horizontal_boxes[i-2:i]
if i == len(self.horizontal_boxes)-2:
return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1])
return self.horizontal_boxes[i+1], self.horizontal_boxes[i+2]
for i, hbox in enumerate(self.horizontal_boxes):
n1, n2 = neighborhood(i)
for gap in hbox.gaps:
gap.is_column_gap = n1.has_intersection_with(gap) and \
n2.has_intersection_with(gap)
def first_pass(self):
self.regions = []
if not self.elements:
return
for i, x in enumerate(self.elements):
x.idx = i
self.current_region = None
for x in self.elements:
self.find_elements_in_row_of(x)
def find_elements_in_row_of(self, x):
    """Build the vertical interval around the top edge of ``x``.

    The interval is only constructed here; nothing is returned or stored.
    """
    fuzz = self.YFUZZ
    avg_height = self.average_text_height
    # The bounds are not symmetric: the lower bound subtracts
    # fuzz * avg_height, the upper bound adds fuzz * (1 + avg_height).
    interval = Interval(x.top - fuzz * avg_height,
                        x.top + fuzz*(1+avg_height))
class PDFDocument(object):
@ -244,6 +205,7 @@ class PDFDocument(object):
self.opts, self.log = opts, log
parser = etree.XMLParser(recover=True)
self.root = etree.fromstring(xml, parser=parser)
idc = iter(xrange(sys.maxint))
self.fonts = []
self.font_map = {}
@ -256,14 +218,15 @@ class PDFDocument(object):
self.page_map = {}
for page in self.root.xpath('//page'):
page = Page(page, self.font_map, opts, log)
page = Page(page, self.font_map, opts, log, idc)
self.page_map[page.id] = page
self.pages.append(page)
self.collect_font_statistics()
for page in self.pages:
page.sort_into_horizontal_boxes(self.font_size_stats)
page.document_font_stats = self.font_size_stats
page.first_pass()
def collect_font_statistics(self):
self.font_size_stats = {}