This commit is contained in:
Kovid Goyal 2009-12-15 08:47:37 -07:00
parent 093404e208
commit b632639ff7

View File

@ -23,6 +23,7 @@ class Text(object):
self.font_map = font_map
self.top, self.left, self.width, self.height = map(float, map(text.get,
('top', 'left', 'width', 'height')))
self.bottom = self.top + self.height
self.font = self.font_map[text.get('font')]
self.font_size = self.font.size
self.color = self.font.color
@ -31,6 +32,18 @@ class Text(object):
self.text_as_string = etree.tostring(text, method='text',
encoding=unicode)
class FontSizeStats(dict):
    '''
    Mapping of font size to the fraction of the grand total that was
    recorded for that size.

    Besides the mapping itself, two attributes are set:

    ``most_common_size``
        The size with the largest value in ``stats`` (``-1`` when ``stats``
        is empty). On a tie the size encountered last while iterating
        ``stats`` wins.
    ``chars_at_most_common_size``
        The raw (un-normalised) value recorded for ``most_common_size``.
    '''

    def __init__(self, stats):
        '''
        :param stats: Mapping of font size to a non-negative number (for
                      example a character count). It is not modified.
        '''
        dict.__init__(self)
        total = float(sum(stats.values()))
        self.most_common_size, self.chars_at_most_common_size = -1, 0

        for sz, chars in stats.items():
            if chars >= self.chars_at_most_common_size:
                self.most_common_size, self.chars_at_most_common_size = sz, chars
            # When every recorded value is zero (e.g. only empty text
            # nodes) the total is zero as well; report a zero fraction
            # instead of raising ZeroDivisionError.
            self[sz] = chars/total if total else 0.0
class Page(object):
def __init__(self, page, font_map, opts, log):
@ -46,6 +59,15 @@ class Page(object):
for text in page.xpath('descendant::text'):
self.texts.append(Text(text, self.font_map, self.opts, self.log))
self.font_size_stats = {}
for t in self.texts:
if t.font_size not in self.font_size_stats:
self.font_size_stats[t.font_size] = 0
self.font_size_stats[t.font_size] += len(t.text_as_string)
self.font_size_stats = FontSizeStats(self.font_size_stats)
class PDFDocument(object):
@ -69,6 +91,17 @@ class PDFDocument(object):
self.page_map[page.id] = page
self.pages.append(page)
self.collect_font_statistics()
def collect_font_statistics(self):
    '''
    Combine the font size statistics of every page in ``self.pages`` into
    document wide statistics, stored as a FontSizeStats instance on
    ``self.font_size_stats``.

    For each font size, the values found in the individual pages'
    ``font_size_stats`` mappings are added together.
    '''
    # NOTE(review): Page stores a FontSizeStats instance, whose values are
    # per-page fractions rather than raw character counts, so every page
    # carries equal weight in the sums below -- confirm this is intended.
    totals = {}
    for page in self.pages:
        # Iterating a dict directly yields only its keys; .items() is
        # needed to obtain the (size, value) pairs unpacked here.
        for sz, chars in page.font_size_stats.items():
            totals[sz] = totals.get(sz, 0) + chars

    self.font_size_stats = FontSizeStats(totals)