Ensure all re.match pattern strings are raw string literals (start with r')

This commit is contained in:
MisterAP 2024-10-28 20:43:32 +00:00
parent 80b60a7389
commit 9dd3df9eb8

View File

@ -243,17 +243,17 @@ class Text(Element):
if self.font_size_em == other.font_size_em \ if self.font_size_em == other.font_size_em \
and False \ and False \
and self.font.id == other.font.id \ and self.font.id == other.font.id \
and re.match('<span style="font-size:', self.raw) is not None \ and re.match(r'<span style="font-size:', self.raw) is not None \
and re.match('<span style="font-size:', other.raw) is not None : and re.match(r'<span style="font-size:', other.raw) is not None :
# We have the same class, so merge # We have the same class, so merge
m_self = re.match('^(.+)</span>$', self.raw) m_self = re.match(r'^(.+)</span>$', self.raw)
m_other = re.match('^<span style="font-size:.+em">(.+</span>)$', other.raw) m_other = re.match(r'^<span style="font-size:.+em">(.+</span>)$', other.raw)
if m_self and m_other: if m_self and m_other:
self.raw = m_self.group(1) self.raw = m_self.group(1)
other.raw = m_other.group(1) other.raw = m_other.group(1)
elif self.font_size_em != other.font_size_em \ elif self.font_size_em != other.font_size_em \
and self.font_size_em != 1.00 : and self.font_size_em != 1.00 :
if re.match('<span', self.raw) is None : if re.match(r'<span', self.raw) is None :
self.raw = '<span style="font-size:%sem">%s</span>'%(str(self.font_size_em),self.raw) self.raw = '<span style="font-size:%sem">%s</span>'%(str(self.font_size_em),self.raw)
# Try to allow for a very large initial character # Try to allow for a very large initial character
elif len(self.text_as_string) <= 2 \ elif len(self.text_as_string) <= 2 \
@ -263,7 +263,7 @@ class Text(Element):
# The line height gets set to the same as other parts of the file # The line height gets set to the same as other parts of the file
# and the font size is reduced. # and the font size is reduced.
# These need to be fixed manually. # These need to be fixed manually.
m_self = re.match('^(.+em">)(.+)$', self.raw) m_self = re.match(r'^(.+em">)(.+)$', self.raw)
self.raw = m_self.group(1) \ self.raw = m_self.group(1) \
+ '<span style="float:left"><span style="line-height:0.5">' \ + '<span style="float:left"><span style="line-height:0.5">' \
+ m_self.group(2) + '</span></span>' + m_self.group(2) + '</span></span>'
@ -289,7 +289,7 @@ class Text(Element):
has_float = '<span style="float:right">' has_float = '<span style="float:right">'
has_gap = 1 has_gap = 1
#else leave has_gap #else leave has_gap
old_float = re.match('^(.*)(<span style="float:right">.*)</span>\s*$', self.raw) old_float = re.match(r'^(.*)(<span style="float:right">.*)</span>\s*$', self.raw)
if old_float: if old_float:
# There is already a float as parts of a line are near the right. # There is already a float as parts of a line are near the right.
# Remove the </span> and put it after this part # Remove the </span> and put it after this part
@ -322,7 +322,7 @@ class Text(Element):
# Note that the 2 parts could have different font sizes # Note that the 2 parts could have different font sizes
matchObj = re.match(r'^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)</a>(</span>)*(\s*)$', self.raw) matchObj = re.match(r'^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)</a>(</span>)*(\s*)$', self.raw)
if matchObj is not None : if matchObj is not None :
otherObj = re.match('^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)(</a>)(</span>)*(.*)$', other.raw) otherObj = re.match(r'^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)(</a>)(</span>)*(.*)$', other.raw)
# There is another href, but is it for the same place? # There is another href, but is it for the same place?
if otherObj is not None and matchObj.group(3) == otherObj.group(3) : if otherObj is not None and matchObj.group(3) == otherObj.group(3) :
m2 = matchObj.group(2) m2 = matchObj.group(2)
@ -983,7 +983,7 @@ class Page:
# #
# The left can wander by a few (SAME_INDENT) pixels. # The left can wander by a few (SAME_INDENT) pixels.
# "float:left" occurs where there is a multi-line character, so indentation is messed up # "float:left" occurs where there is a multi-line character, so indentation is messed up
lchar = re.match('.*([^ ])\s*$', first_text.text_as_string) lchar = re.match(r'.*([^ ])\s*$', first_text.text_as_string)
last_char = ' ' # Nothing interesting last_char = ' ' # Nothing interesting
if lchar is not None: if lchar is not None:
last_char = lchar.group(1) # Final non-space char last_char = lchar.group(1) # Final non-space char
@ -1925,8 +1925,8 @@ class PDFDocument:
else: else:
merged_len = merged_text.right merged_len = merged_text.right
# Allow where the last line ends with or next line starts with lower case. # Allow where the last line ends with or next line starts with lower case.
if re.match('.*[a-z, -]$', last_line.text_as_string) is not None \ if re.match(r'.*[a-z, -]$', last_line.text_as_string) is not None \
or re.match('^[a-z, -]', merged_text.text_as_string) is not None : or re.match(r'^[a-z, -]', merged_text.text_as_string) is not None :
merged_len = merged_text.right merged_len = merged_text.right
# To use merged_len etc. # To use merged_len etc.
@ -1936,8 +1936,8 @@ class PDFDocument:
and 'href=' not in merged_text.raw \ and 'href=' not in merged_text.raw \
and merged_text.left < stats_left + merged_text.average_character_width \ and merged_text.left < stats_left + merged_text.average_character_width \
and not last_spare > merged_len \ and not last_spare > merged_len \
and not (re.match('.*[.!?](\u201d|”)$', last_line.text_as_string) is not None and not (re.match(r'.*[.!?](\u201d|”)$', last_line.text_as_string) is not None
and re.match('^(\u201c|“).*', merged_text.text_as_string) is not None): and re.match(r'^(\u201c|“).*', merged_text.text_as_string) is not None):
merge_done = True merge_done = True
# We don't want to merge partial pages # We don't want to merge partial pages
# i.e. if this is the last line, preserve its top/bottom till after merge # i.e. if this is the last line, preserve its top/bottom till after merge
@ -1970,7 +1970,7 @@ class PDFDocument:
and (len(self.pages[pind+1].texts) == 0 \ and (len(self.pages[pind+1].texts) == 0 \
or self.pages[pind+1].texts[0].top > self.pages[pind+1].imgs[0].top)): or self.pages[pind+1].texts[0].top > self.pages[pind+1].imgs[0].top)):
page.page_break_after = True page.page_break_after = True
elif (re.match('.*[a-z, ]$', last_line.text_as_string) is not None \ elif (re.match(r'.*[a-z, ]$', last_line.text_as_string) is not None \
or last_line.final_width > page.width*self.opts.unwrap_factor): or last_line.final_width > page.width*self.opts.unwrap_factor):
# or (last_line.right * 100.0 / page.right_margin) > LAST_LINE_PERCENT): # or (last_line.right * 100.0 / page.right_margin) > LAST_LINE_PERCENT):
candidate = page candidate = page