More margin/indentation processing

This has a variation on the divide-by-zero fix
This commit is contained in:
MisterAP 2024-10-26 12:59:15 +01:00
parent c2da702b0f
commit ceb6e5b45e

View File

@ -19,7 +19,7 @@ from lxml import etree
PAGE_SCAN_COUNT = 20 # Arbitrary PAGE_SCAN_COUNT = 20 # Arbitrary
LINE_SCAN_COUNT = 2 # Arbitrary LINE_SCAN_COUNT = 2 # Arbitrary
# Fraction of a character width that two strings have to be apart, # Number of character widths that two strings have to be apart,
# for them to be considered part of the same text fragment # for them to be considered part of the same text fragment
# The problem is justified text where fragments can be widely spaced # The problem is justified text where fragments can be widely spaced
# Was 0.5 but this forces just about anything to coalesce. # Was 0.5 but this forces just about anything to coalesce.
@ -71,6 +71,10 @@ RIGHT_FACTOR = 1.8
# and still be considered centered. 0.15 = 15% # and still be considered centered. 0.15 = 15%
CENTER_FACTOR = 0.15 CENTER_FACTOR = 0.15
# How near does text right need to be to right margin
# to be considered right aligned. 0.1 = 10%
RIGHT_FLOAT_FACTOR = 0.1
#### Indents and line spacing #### Indents and line spacing
# How near must pixel values be to appear the same # How near must pixel values be to appear the same
SAME_SPACE = 3.0 SAME_SPACE = 3.0
@ -149,6 +153,7 @@ class Text(Element):
self.margin_right = 0 # Normal margins self.margin_right = 0 # Normal margins
# When joining lines for a paragraph, remember position of last line joined # When joining lines for a paragraph, remember position of last line joined
self.last_left = self.left self.last_left = self.left
self.last_right = self.right
# Remember the length of this line if it is merged into a paragraph # Remember the length of this line if it is merged into a paragraph
self.final_width = self.width self.final_width = self.width
# Align = Left, Right, Center, Justified. Default L # Align = Left, Right, Center, Justified. Default L
@ -176,21 +181,37 @@ class Text(Element):
self.raw += etree.tostring(x, method='xml', encoding='unicode') self.raw += etree.tostring(x, method='xml', encoding='unicode')
self.average_character_width = self.width/len(self.text_as_string) self.average_character_width = self.width/len(self.text_as_string)
def coalesce(self, other, page_number, left_margin): @property
def is_empty(self):
# There is nothing in this Text
return len(self.raw) == 0
@property
def is_spaces(self):
# There are only spaces in this Text
return len(self.raw) > 0 \
and (re.match('^\s+$', self.raw) is not None \
or re.match('^\s*<i>\s*</i>\s*$', self.raw) is not None \
or re.match('^\s*<b>\s*</b>\s*$', self.raw) is not None)
def coalesce(self, other, page_number, left_margin, right_margin):
if self.opts.verbose > 2: if self.opts.verbose > 2:
self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string, self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
other.text_as_string, page_number)) other.text_as_string, page_number))
# Need to work out how to decide this # Need to work out how to decide this
# For elements of the same line, is there a space between? # For elements of the same line, is there a space between?
has_float = ''
# Spaces are narrow, so a_c_w/3 # Spaces are narrow, so a_c_w/3
# Or assume any gap = a space?
if (self.top <= other.top and self.bottom >= other.bottom) \ if (self.top <= other.top and self.bottom >= other.bottom) \
and abs(other.left - self.right) < self.average_character_width / 3.0: and abs(other.left - self.right) < 2.0:
#and abs(other.left - self.right) < self.average_character_width / 3.0:
has_gap = 0 has_gap = 0
else: # Insert n spaces to fill gap. Use TAB? Columns? else: # Insert n spaces to fill gap. Use TAB? Columns?
if other.left < self.right: if other.left < self.right:
has_gap = 1 # Coalescing different lines. 1 space has_gap = 1 # Coalescing different lines. 1 space
else: # Multiple texts on same line else: # Multiple texts on same line
has_gap = round(abs(other.left - self.right) / self.average_character_width) has_gap = round(0.5+abs(other.left - self.right) / self.average_character_width)
# Allow for super or subscript. These probably have lower heights # Allow for super or subscript. These probably have lower heights
# In this case, don't use their top/bottom # In this case, don't use their top/bottom
@ -258,6 +279,11 @@ class Text(Element):
has_gap = 1 has_gap = 1
else: else:
has_gap = 0 has_gap = 0
else: # Large gap
if other.right > right_margin - right_margin * RIGHT_FLOAT_FACTOR:
has_float = '<span style="float:right">'
has_gap = 1
#else leave has_gap
# Insert multiple spaces # Insert multiple spaces
while has_gap > 0: while has_gap > 0:
self.text_as_string += ' ' self.text_as_string += ' '
@ -304,7 +330,11 @@ class Text(Element):
#elif sub_super > 0: #elif sub_super > 0:
# other.raw = '<sub>' + other.raw + '</sub>' # other.raw = '<sub>' + other.raw + '</sub>'
if len(has_float):
self.raw += has_float
self.raw += other.raw self.raw += other.raw
if len(has_float):
self.raw += '</span>'
self.average_character_width = self.width/len(self.text_as_string) self.average_character_width = self.width/len(self.text_as_string)
#self.last_left = other.left #self.last_left = other.left
@ -696,22 +726,10 @@ class Page:
for text in page.xpath('descendant::text'): for text in page.xpath('descendant::text'):
self.texts.append(Text(text, self.font_map, self.opts, self.log, idc)) self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
text = self.texts[-1] text = self.texts[-1]
# Allow for ' <i|b|etc> ...'
# In fact, these get split by xpath, so the '<' will be at the start of a fragment
matchObj = re.match(r'^(\s*)(<[^>]+>)?(\s*)(.*)$', text.raw)
s1 = matchObj.group(1)
s2 = matchObj.group(3)
t1 = matchObj.group(2)
t2 = matchObj.group(4)
if t1 is None:
t1 = ''
if t2 is None:
t2 = ''
tx = t1 + t2
# Check within page boundaries. # Check within page boundaries.
# Remove lines of only spaces. Could be <i> </i> etc., but process later # Remove lines of only spaces. Could be <i> </i> etc., but process later
# Need to keep any href= (and others?) # Need to keep any href= (and others?)
if len(tx) == 0 \ if text.is_spaces \
or text.top < self.top \ or text.top < self.top \
or text.top > self.height \ or text.top > self.height \
or text.left > self.left+self.width \ or text.left > self.left+self.width \
@ -721,14 +739,26 @@ class Page:
elif (self.opts.pdf_header_skip <= 0 or text.top >= self.opts.pdf_header_skip) \ elif (self.opts.pdf_header_skip <= 0 or text.top >= self.opts.pdf_header_skip) \
and (self.opts.pdf_footer_skip <= 0 or text.top <= self.opts.pdf_footer_skip): and (self.opts.pdf_footer_skip <= 0 or text.top <= self.opts.pdf_footer_skip):
# Remove leading spaces and make into indent # Remove leading spaces and make into indent
# Assume 1 space = 1 av_char_width? # Assume 1 space < 1 av_char_width?
s = len(s1) + len(s2) s = 0
# Test above has shown string is not all spaces
# but it could hold no text
while s < len(text.text_as_string) \
and text.text_as_string[s] == ' ':
s += 1
if s > 2: # Allow two leading spaces if s > 2: # Allow two leading spaces
# Assume this is a standard indent # Assume this is a standard indent
# Normally text.indented gets set later # Normally text.indented gets set later
text.indented = 1 text.indented = 1
w = round(s * text.average_character_width/2.0) # Spaces < avg width w = round(s * text.average_character_width/2.0) # Spaces < avg width
text.raw = tx matchObj = re.match(r'^\s*(<[^>]+>)?\s*(.*)$', text.raw)
t1 = matchObj.group(1)
t2 = matchObj.group(2)
if t1 is None:
t1 = ''
if t2 is None:
t2 = ''
text.raw = t1 + t2
text.text_as_string = text.text_as_string[s:] text.text_as_string = text.text_as_string[s:]
text.left += w # Add indent text.left += w # Add indent
text.last_left += w text.last_left += w
@ -736,7 +766,7 @@ class Page:
text.final_width -= w text.final_width -= w
self.left_margin = min(text.left, self.left_margin) self.left_margin = min(text.left, self.left_margin)
self.right_margin = max(text.right, self.right_margin) self.right_margin = max(text.right, self.right_margin)
# Change #nnn to page_nnn in hrefs # Change #nnn to #page_nnn in hrefs
matchObj = re.match(r'^(.*)(<a href)(.+)("index.html#)(\d+)(".+)$', text.raw) matchObj = re.match(r'^(.*)(<a href)(.+)("index.html#)(\d+)(".+)$', text.raw)
if matchObj is not None: if matchObj is not None:
text.raw = matchObj.group(1)+matchObj.group(2)+matchObj.group(3)+matchObj.group(4) \ text.raw = matchObj.group(1)+matchObj.group(2)+matchObj.group(3)+matchObj.group(4) \
@ -773,6 +803,12 @@ class Page:
self.font_size_stats = FontSizeStats(self.font_size_stats) self.font_size_stats = FontSizeStats(self.font_size_stats)
@property
def is_empty(self):
# There is nothing in this Page
return len(self.texts) == 0 \
and len(self.imgs) == 0
def find_match(self, frag): def find_match(self, frag):
for t in self.texts: for t in self.texts:
# and abs(t.bottom - frag.bottom) <= BOTTOM_FACTOR : # and abs(t.bottom - frag.bottom) <= BOTTOM_FACTOR :
@ -785,7 +821,7 @@ class Page:
or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \ or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \
or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \ or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \
or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom): or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom):
#return t # Force match if same line return t # Force match if same line
# Sorting can put parts of a line in the wrong order if there are small chars # Sorting can put parts of a line in the wrong order if there are small chars
if t.left < frag.left: if t.left < frag.left:
hdelta = frag.left - t.right hdelta = frag.left - t.right
@ -815,7 +851,7 @@ class Page:
x = frag x = frag
frag = match frag = match
match = x match = x
frag.coalesce(match, self.number, self.left_margin) frag.coalesce(match, self.number, self.left_margin, self.right_margin)
break # Leave tind break # Leave tind
tind += 1 tind += 1
if match is not None: if match is not None:
@ -845,14 +881,14 @@ class Page:
m = len(self.texts) m = len(self.texts)
for i in range(m): for i in range(m):
t = self.texts[i] t = self.texts[i]
lmargin = t.left lmargin = t.last_left # Allow for lines joined into a para
rmargin = self.width - t.right rmargin = self.width - t.last_right
# Do we have a sequence of indented lines? # Do we have a sequence of indented lines?
xmargin = ymargin = -1 xmargin = ymargin = -1
if i > 0: if i > 0:
xmargin = self.texts[i-1].left xmargin = self.texts[i-1].last_left
if i < m-1: if i < m-1:
ymargin = self.texts[i+1].left ymargin = self.texts[i+1].last_left
# Don't want to set headings on a Contents page # Don't want to set headings on a Contents page
# NB Doesn't work where Contents goes to another page # NB Doesn't work where Contents goes to another page
@ -940,6 +976,7 @@ class Page:
or (second_text.left >= first_text.last_left \ or (second_text.left >= first_text.last_left \
and second_text.bottom <= first_text.bottom)) \ and second_text.bottom <= first_text.bottom)) \
and 'href=' not in second_text.raw \ and 'href=' not in second_text.raw \
and '"float:right"' not in first_text.raw \
and first_text.bottom + stats.line_space + (stats.line_space*LINE_FACTOR) \ and first_text.bottom + stats.line_space + (stats.line_space*LINE_FACTOR) \
>= second_text.bottom \ >= second_text.bottom \
and first_text.final_width > self.width*self.opts.unwrap_factor \ and first_text.final_width > self.width*self.opts.unwrap_factor \
@ -965,16 +1002,18 @@ class Page:
while tind < len(self.texts): while tind < len(self.texts):
frag = self.texts[tind] frag = self.texts[tind]
# Remove lines of only spaces # Remove lines of only spaces
if re.match(r'^\s+$', frag.raw) is not None: if frag.is_spaces:
match = frag match = frag
break # Leave tind break # Leave tind
if last_frag is not None \ if last_frag is not None \
and frag != last_frag \ and frag != last_frag \
and can_merge(self, last_frag, frag, stats): and can_merge(self, last_frag, frag, stats):
last_frag.coalesce(frag, self.number, self.left_margin) last_frag.coalesce(frag, self.number, self.left_margin, self.right_margin)
last_frag.last_left = frag.left last_frag.last_left = frag.left
last_frag.last_right = frag.right
last_frag.final_width = frag.final_width last_frag.final_width = frag.final_width
# Check for centred done later
match = frag match = frag
break # Leave tind break # Leave tind
else: else:
@ -990,10 +1029,8 @@ class Page:
frag.indented = 1 # 1em frag.indented = 1 # 1em
else: # Assume left margin of approx = number of chars else: # Assume left margin of approx = number of chars
# Should check for values approx the same, as with indents # Should check for values approx the same, as with indents
try: #frag.margin_left = int(round((frag.left - left) / frag.average_character_width)+0.5)
frag.margin_left = int(round((frag.left - left) / frag.average_character_width)+0.5) frag.margin_left = int(round((frag.left - left) / self.stats_margin_px)+0.5)
except ZeroDivisionError:
frag.margin_left = 0
if last_frag is not None \ if last_frag is not None \
and frag.bottom - last_frag.bottom \ and frag.bottom - last_frag.bottom \
> stats.para_space*SECTION_FACTOR: > stats.para_space*SECTION_FACTOR:
@ -1245,7 +1282,10 @@ class Page:
self.stats_left_even = stats.left_even self.stats_left_even = stats.left_even
self.stats_indent_even = stats.indent_even self.stats_indent_even = stats.indent_even
self.stats_indent_even1 = stats.indent_even1 self.stats_indent_even1 = stats.indent_even1
self.stats_right = stats.right self.stats_right = stats.right # Needs work
self.stats_right_odd = stats.right
self.stats_right_even = stats.right
self.stats_margin_px = stats.margin_px
# Join lines to form paragraphs # Join lines to form paragraphs
self.coalesce_paras(stats) self.coalesce_paras(stats)
@ -1436,6 +1476,8 @@ class PDFDocument:
self.stats.font_size = self.fonts[f_ind].size self.stats.font_size = self.fonts[f_ind].size
else: else:
self.stats.font_size = 12.0 self.stats.font_size = 12.0
# 1em of a 12pt font is about 16px. Save for indentation/margin setting
self.stats.margin_px = max(self.stats.font_size * 16.0/12.0, 1.0) # Ensure never zero
for f in self.fonts: for f in self.fonts:
f.size_em = round(f.size / self.stats.font_size, 2) f.size_em = round(f.size / self.stats.font_size, 2)
@ -1701,25 +1743,19 @@ class PDFDocument:
head_text = [''] * LINE_SCAN_COUNT head_text = [''] * LINE_SCAN_COUNT
head_match = [0] * LINE_SCAN_COUNT head_match = [0] * LINE_SCAN_COUNT
head_match1 = [0] * LINE_SCAN_COUNT head_match1 = [0] * LINE_SCAN_COUNT
#head_text = ''
#head_match = 0
#head_match1 = 0
head_page = 0 head_page = 0
head_skip = 0 head_skip = 0
foot_text = [''] * LINE_SCAN_COUNT foot_text = [''] * LINE_SCAN_COUNT
foot_match = [0] * LINE_SCAN_COUNT foot_match = [0] * LINE_SCAN_COUNT
foot_match1 = [0] * LINE_SCAN_COUNT foot_match1 = [0] * LINE_SCAN_COUNT
#foot_text = ''
#foot_match = 0
#foot_match1 = 0
foot_page = 0 foot_page = 0
foot_skip = 0 foot_skip = 0
pagenum_text = r'(.*\d+\s+\w+\s+\d+.*)|(\s*\d+\s+.*)|(^\s*[ivxlcIVXLC]+\s*$)' pagenum_text = r'(.*\d+\s+\w+\s+\d+.*)|(\s*\d+\s+.*)|(^\s*[ivxlcIVXLC]+\s*$)'
pages_to_scan = scan_count pages_to_scan = scan_count
# Note the a line may be in more than 1 part # Note that a line may be in more than 1 part
# e.g. Page 1 of 6 ... DocName.pdf # e.g. Page 1 of 6 ... DocName.pdf
# so should merge first 2 lines if same top # so maybe should merge first 2 lines if same top
# Ditto last 2 lines # Ditto last 2 lines
# Maybe should do more than 2 parts # Maybe should do more than 2 parts
for page in self.pages: for page in self.pages:
@ -1924,9 +1960,11 @@ class PDFDocument:
# The text has been appended to this page, so coalesce the paragraph # The text has been appended to this page, so coalesce the paragraph
if merged_page.odd_even: if merged_page.odd_even:
left_margin = merged_page.stats_left_odd left_margin = merged_page.stats_left_odd
right_margin = merged_page.stats_right_odd
else: else:
left_margin = merged_page.stats_left_even left_margin = merged_page.stats_left_even
candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin) right_margin = merged_page.stats_right_odd
candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin, right_margin)
merged_page.texts.remove(merged_text) merged_page.texts.remove(merged_text)
# Put back top/bottom after coalesce if final line # Put back top/bottom after coalesce if final line
if save_bottom != 0.0 : if save_bottom != 0.0 :
@ -1935,8 +1973,7 @@ class PDFDocument:
candidate.texts[-1].bottom = save_bottom candidate.texts[-1].bottom = save_bottom
#candidate.coalesce_paras() #candidate.coalesce_paras()
# Have we removed everything from this page (well, all texts and images) # Have we removed everything from this page (well, all texts and images)
if len(merged_page.texts) == 0 \ if merged_page.is_empty:
and len(merged_page.imgs) == 0:
candidate.texts[-1].blank_line_before = 1 candidate.texts[-1].blank_line_before = 1
self.pages.remove(merged_page) self.pages.remove(merged_page)