mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
...
This commit is contained in:
parent
093404e208
commit
b632639ff7
@ -23,6 +23,7 @@ class Text(object):
|
|||||||
self.font_map = font_map
|
self.font_map = font_map
|
||||||
self.top, self.left, self.width, self.height = map(float, map(text.get,
|
self.top, self.left, self.width, self.height = map(float, map(text.get,
|
||||||
('top', 'left', 'width', 'height')))
|
('top', 'left', 'width', 'height')))
|
||||||
|
self.bottom = self.top + self.height
|
||||||
self.font = self.font_map[text.get('font')]
|
self.font = self.font_map[text.get('font')]
|
||||||
self.font_size = self.font.size
|
self.font_size = self.font.size
|
||||||
self.color = self.font.color
|
self.color = self.font.color
|
||||||
@ -31,6 +32,18 @@ class Text(object):
|
|||||||
self.text_as_string = etree.tostring(text, method='text',
|
self.text_as_string = etree.tostring(text, method='text',
|
||||||
encoding=unicode)
|
encoding=unicode)
|
||||||
|
|
||||||
|
class FontSizeStats(dict):
    """Mapping of font size -> fraction of all characters using that size.

    Built from a mapping of font size -> character count. Each key of
    ``stats`` becomes a key of this dict whose value is that size's count
    divided by the sum of all counts.

    Attributes set on the instance:
        most_common_size: the size with the largest count, or -1 when
            ``stats`` is empty.
        chars_at_most_common_size: the count recorded for that size, or 0
            when ``stats`` is empty.
    """

    def __init__(self, stats):
        dict.__init__(self)
        total = float(sum(stats.values()))
        self.most_common_size, self.chars_at_most_common_size = -1, 0

        for sz, chars in stats.items():
            # ">=" (not ">") so that a size is still selected when every
            # count is 0; on a tie the size iterated last wins.
            if chars >= self.chars_at_most_common_size:
                self.most_common_size, self.chars_at_most_common_size = sz, chars
            # When all counts are zero there is nothing to apportion; report
            # 0.0 instead of raising ZeroDivisionError.
            self[sz] = chars / total if total else 0.0
|
||||||
|
|
||||||
|
|
||||||
class Page(object):
|
class Page(object):
|
||||||
|
|
||||||
def __init__(self, page, font_map, opts, log):
|
def __init__(self, page, font_map, opts, log):
|
||||||
@ -46,6 +59,15 @@ class Page(object):
|
|||||||
for text in page.xpath('descendant::text'):
|
for text in page.xpath('descendant::text'):
|
||||||
self.texts.append(Text(text, self.font_map, self.opts, self.log))
|
self.texts.append(Text(text, self.font_map, self.opts, self.log))
|
||||||
|
|
||||||
|
self.font_size_stats = {}
|
||||||
|
for t in self.texts:
|
||||||
|
if t.font_size not in self.font_size_stats:
|
||||||
|
self.font_size_stats[t.font_size] = 0
|
||||||
|
self.font_size_stats[t.font_size] += len(t.text_as_string)
|
||||||
|
|
||||||
|
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
|
||||||
@ -69,6 +91,17 @@ class PDFDocument(object):
|
|||||||
self.page_map[page.id] = page
|
self.page_map[page.id] = page
|
||||||
self.pages.append(page)
|
self.pages.append(page)
|
||||||
|
|
||||||
|
self.collect_font_statistics()
|
||||||
|
|
||||||
|
def collect_font_statistics(self):
    """Aggregate the per-page font size statistics into document-wide ones.

    Sums, per font size, the values found in every page's
    ``font_size_stats`` mapping and stores the result on
    ``self.font_size_stats`` as a ``FontSizeStats``.

    NOTE(review): the values summed here are whatever each page's
    ``font_size_stats`` holds. If those are already ``FontSizeStats``
    instances, the values are per-page fractions rather than raw character
    counts, so every page contributes equal weight -- confirm this is the
    intended weighting.
    """
    totals = {}
    for p in self.pages:
        # Iterate (size, value) pairs; iterating the mapping itself yields
        # only the keys and cannot be unpacked into two names.
        for sz, chars in p.font_size_stats.items():
            totals[sz] = totals.get(sz, 0) + chars

    self.font_size_stats = FontSizeStats(totals)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user