This commit is contained in:
Kovid Goyal 2024-11-07 20:26:34 +05:30
commit 5d6911a503
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -179,7 +179,7 @@ class Text(Element):
self.raw = text.text if text.text else '' self.raw = text.text if text.text else ''
for x in text.iterchildren(): for x in text.iterchildren():
self.raw += etree.tostring(x, method='xml', encoding='unicode') self.raw += etree.tostring(x, method='xml', encoding='unicode')
self.average_character_width = self.width/len(self.text_as_string) self.set_av_char_width()
@property @property
def is_empty(self): def is_empty(self):
@ -195,6 +195,9 @@ class Text(Element):
re.match(r'^\s*<b>\s*</b>\s*$', self.raw) is not None re.match(r'^\s*<b>\s*</b>\s*$', self.raw) is not None
) )
def set_av_char_width(self):
    """Recompute ``self.average_character_width`` from the width and text.

    The value is ``self.width`` divided by the number of characters in
    ``self.text_as_string``, with a floor of 0.1 so that it is never zero.
    If the text is empty there is nothing to average over, so the floor is
    used directly instead of letting the division raise
    ``ZeroDivisionError``.
    """
    min_width = 0.1  # Ensure never zero
    num_chars = len(self.text_as_string)
    if not num_chars:
        # No characters at all (presumably markup with no text content):
        # dividing would raise, so fall back to the minimum width.
        self.average_character_width = min_width
        return
    self.average_character_width = max(self.width/num_chars, min_width)
def coalesce(self, other, page_number, left_margin, right_margin): def coalesce(self, other, page_number, left_margin, right_margin):
if self.opts.verbose > 2: if self.opts.verbose > 2:
self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string, self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
@ -352,7 +355,7 @@ class Text(Element):
self.raw += other.raw self.raw += other.raw
if has_float: if has_float:
self.raw += '</span>' self.raw += '</span>'
self.average_character_width = self.width/len(self.text_as_string) self.set_av_char_width()
#self.last_left = other.left #self.last_left = other.left
def to_html(self): def to_html(self):
@ -390,7 +393,7 @@ class Paragraph(Text):
self.raw = text.text if text.text else '' self.raw = text.text if text.text else ''
for x in text.iterchildren(): for x in text.iterchildren():
self.raw += etree.tostring(x, method='xml', encoding='unicode') self.raw += etree.tostring(x, method='xml', encoding='unicode')
self.average_character_width = self.width/len(self.text_as_string) self.set_av_char_width()
def to_html(self): def to_html(self):
return self.raw return self.raw
@ -1832,7 +1835,8 @@ class PDFDocument:
for i in range(LINE_SCAN_COUNT): for i in range(LINE_SCAN_COUNT):
if head_match[i] > pages_to_scan or head_match1[i] > pages_to_scan: if head_match[i] > pages_to_scan or head_match1[i] > pages_to_scan:
head_ind = i # Remember the last matching line head_ind = i # Remember the last matching line
if head_match[head_ind] > pages_to_scan or head_match1[head_ind] > pages_to_scan: if self.pages[head_page].texts \
and (head_match[head_ind] > pages_to_scan or head_match1[head_ind] > pages_to_scan):
t = self.pages[head_page].texts[head_ind] t = self.pages[head_page].texts[head_ind]
head_skip = t.top + t.height + 1 head_skip = t.top + t.height + 1
@ -1840,7 +1844,8 @@ class PDFDocument:
for i in range(LINE_SCAN_COUNT): for i in range(LINE_SCAN_COUNT):
if foot_match[i] > pages_to_scan or foot_match1[i] > pages_to_scan: if foot_match[i] > pages_to_scan or foot_match1[i] > pages_to_scan:
foot_ind = i # Remember the last matching line foot_ind = i # Remember the last matching line
if foot_match[foot_ind] > pages_to_scan or foot_match1[foot_ind] > pages_to_scan: if self.pages[foot_page].texts \
and (foot_match[foot_ind] > pages_to_scan or foot_match1[foot_ind] > pages_to_scan):
t = self.pages[foot_page].texts[-foot_ind-1] t = self.pages[foot_page].texts[-foot_ind-1]
foot_skip = t.top - 1 foot_skip = t.top - 1