From b632639ff790a77d741821952b77a54cefae633c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 15 Dec 2009 08:47:37 -0700
Subject: [PATCH] ...

---
Review notes (free-form text between the "---" separator and the diff;
it is not part of the commit message):

* Purpose: record Text.bottom, and collect per-page and per-document
  font size statistics through a new FontSizeStats(dict) helper that
  maps font size -> share of characters and remembers the size with
  the highest character count.
* PDFDocument.collect_font_statistics() iterates
  p.font_size_stats.items(). Iterating the dict itself yields only the
  keys, so unpacking each element into (sz, chars) could not work.
* FontSizeStats guards the division: when every count in `stats` is
  zero (for example a page whose text elements are all empty) the
  share is stored as 0.0 instead of raising ZeroDivisionError.
* NOTE(review): Page.font_size_stats is already normalised to
  fractions when collect_font_statistics() runs, so the document level
  figures are sums of per-page fractions rather than character counts.
  Confirm that weighting every page equally is intended.
* Line breaks and indentation were reconstructed from a copy of this
  patch that had lost them; blank context lines follow the hunk line
  counts. Verify against the target file before applying.

 src/calibre/ebooks/pdf/reflow.py | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 8cef0f327d..7afbb62b45 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -23,6 +23,7 @@ class Text(object):
         self.font_map = font_map
         self.top, self.left, self.width, self.height = map(float, map(text.get,
             ('top', 'left', 'width', 'height')))
+        self.bottom = self.top + self.height
         self.font = self.font_map[text.get('font')]
         self.font_size = self.font.size
         self.color = self.font.color
@@ -31,6 +32,18 @@ class Text(object):
         self.text_as_string = etree.tostring(text, method='text',
                 encoding=unicode)
 
+class FontSizeStats(dict):
+
+    def __init__(self, stats):
+        total = float(sum(stats.values()))
+        self.most_common_size, self.chars_at_most_common_size = -1, 0
+
+        for sz, chars in stats.items():
+            if chars >= self.chars_at_most_common_size:
+                self.most_common_size, self.chars_at_most_common_size = sz, chars
+            self[sz] = chars/total if total else 0.0
+
+
 class Page(object):
 
     def __init__(self, page, font_map, opts, log):
@@ -46,6 +59,15 @@ class Page(object):
 
         for text in page.xpath('descendant::text'):
             self.texts.append(Text(text, self.font_map, self.opts, self.log))
+        self.font_size_stats = {}
+        for t in self.texts:
+            if t.font_size not in self.font_size_stats:
+                self.font_size_stats[t.font_size] = 0
+            self.font_size_stats[t.font_size] += len(t.text_as_string)
+
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
+
+
 
 class PDFDocument(object):
 
@@ -69,6 +91,17 @@ class PDFDocument(object):
             self.page_map[page.id] = page
             self.pages.append(page)
 
+        self.collect_font_statistics()
+
+    def collect_font_statistics(self):
+        self.font_size_stats = {}
+        for p in self.pages:
+            for sz, chars in p.font_size_stats.items():
+                if sz not in self.font_size_stats:
+                    self.font_size_stats[sz] = 0
+                self.font_size_stats[sz] += chars
+
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
 
 
 