From 07500e1ea344416a7fb51897334b3d28ab38a683 Mon Sep 17 00:00:00 2001 From: un-pogaz <46523284+un-pogaz@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:14:20 +0100 Subject: [PATCH] always use raw-string for regex (extra-edit) --- recipes/birmingham_evening_mail.recipe | 3 +- recipes/standardmedia_ke.recipe | 2 +- recipes/zeitde_sub.recipe | 12 +-- resources/default_tweaks.py | 8 +- src/calibre/__init__.py | 8 +- src/calibre/db/fts/text.py | 2 +- .../ebooks/conversion/plugins/rtf_input.py | 4 +- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 76 +++++++++---------- src/calibre/ebooks/lrf/html/convert_from.py | 2 +- src/calibre/ebooks/metadata/sources/amazon.py | 4 +- .../ebooks/metadata/sources/edelweiss.py | 4 +- src/calibre/ebooks/mobi/reader/mobi6.py | 10 +-- src/calibre/ebooks/readability/readability.py | 12 +-- src/calibre/ebooks/rtf/rtfml.py | 2 +- src/calibre/ebooks/snb/snbml.py | 4 +- src/calibre/ebooks/textile/functions.py | 2 +- src/calibre/ebooks/textile/unsmarten.py | 6 +- src/calibre/ebooks/txt/processor.py | 2 +- src/calibre/ebooks/txt/textileml.py | 4 +- src/calibre/gui2/store/search/models.py | 2 +- src/calibre/gui2/store/search/search.py | 2 +- src/calibre/gui2/store/stores/empik_plugin.py | 2 +- src/calibre/library/catalogs/csv_xml.py | 2 +- src/calibre/srv/content.py | 4 +- src/calibre/web/fetch/simple.py | 2 +- 26 files changed, 89 insertions(+), 94 deletions(-) diff --git a/recipes/birmingham_evening_mail.recipe b/recipes/birmingham_evening_mail.recipe index 3f4d858d54..8549dd2216 100644 --- a/recipes/birmingham_evening_mail.recipe +++ b/recipes/birmingham_evening_mail.recipe @@ -57,8 +57,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.birminghammail.co.uk') - cov = soup.find(attrs={'src': re.compile( - r'http://images.icnetwork.co.uk/upl/birm')}) + cov = soup.find(attrs={'src': 
re.compile(r'http://images.icnetwork.co.uk/upl/birm')}) cov = str(cov) cov2 = re.findall( r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) diff --git a/recipes/standardmedia_ke.recipe b/recipes/standardmedia_ke.recipe index 8224a622a9..d86c4f3a99 100644 --- a/recipes/standardmedia_ke.recipe +++ b/recipes/standardmedia_ke.recipe @@ -44,7 +44,7 @@ class StandardMediaKeRecipe(BasicNewsRecipe): def print_version(self, url): import re p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php') - return p.sub('http://www.standardmedia.co.ke/print.php', url) + return p.sub(r'http://www.standardmedia.co.ke/print.php', url) def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/zeitde_sub.recipe b/recipes/zeitde_sub.recipe index f3b044fa32..d4301dac3b 100644 --- a/recipes/zeitde_sub.recipe +++ b/recipes/zeitde_sub.recipe @@ -199,12 +199,10 @@ class ZeitEPUBAbo(BasicNewsRecipe): # '.*E-Paper.*')) # used to be '.*Abo-Bereich.*' # browser.follow_link(abolink) # find page for latest issue - latestlink = browser.find_link(text_regex=re.compile( - r'.*ZUR AKTUELLEN AUSGABE.*')) + latestlink = browser.find_link(text_regex=re.compile(r'.*ZUR AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # now find the correct file, we will still use the ePub file - epublink = browser.find_link(text_regex=re.compile( - r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017 + epublink = browser.find_link(text_regex=re.compile(r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017 response = browser.follow_link(epublink) self.report_progress(1, _('next step')) @@ -265,12 +263,10 @@ class ZeitEPUBAbo(BasicNewsRecipe): # '.*Abo-Bereich.*')) # browser.follow_link(abolink) # find page for latest issue - latestlink = browser.find_link(text_regex=re.compile( - r'.*ZUR AKTUELLEN AUSGABE.*')) + latestlink = browser.find_link(text_regex=re.compile(r'.*ZUR 
AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # actual cover search - pdflink = browser.find_link(text_regex=re.compile( - r'.*GESAMT-PDF LADEN.*')) + pdflink = browser.find_link(text_regex=re.compile(r'.*GESAMT-PDF LADEN.*')) cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + ( urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf') self.log.warning('PDF link found:') diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 4cc9273d99..f45e84de48 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -228,10 +228,10 @@ per_language_title_sort_articles = { # Polish 'pol': (), # Italian - 'ita': ('Lo\\s+', 'Il\\s+', "L'", 'L´', 'La\\s+', 'Gli\\s+', - 'I\\s+', 'Le\\s+', 'Uno\\s+', 'Un\\s+', 'Una\\s+', "Un'", - 'Un´', 'Dei\\s+', 'Degli\\s+', 'Delle\\s+', 'Del\\s+', - 'Della\\s+', 'Dello\\s+', "Dell'", 'Dell´'), + 'ita': (r'Lo\s+', r'Il\s+', r"L'", r'L´', r'La\s+', r'Gli\s+', + r'I\s+', r'Le\s+', r'Uno\s+', r'Un\s+', r'Una\s+', r"Un'", + r'Un´', r'Dei\s+', r'Degli\s+', r'Delle\s+', r'Del\s+', + r'Della\s+', r'Dello\s+', r"Dell'", r'Dell´'), # Portuguese 'por': (r'A\s+', r'O\s+', r'Os\s+', r'As\s+', r'Um\s+', r'Uns\s+', r'Uma\s+', r'Umas\s+'), diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 6ee432132c..c5697bce13 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -256,10 +256,10 @@ def get_parsed_proxy(typ='http', debug=True): proxy = proxies.get(typ, None) if proxy: pattern = re.compile(( - '(?:ptype://)?' - '(?:(?P\\w+):(?P.*)@)?' - '(?P[\\w\\-\\.]+)' - '(?::(?P\\d+))?').replace('ptype', typ) + r'(?:ptype://)?' + r'(?:(?P\w+):(?P.*)@)?' 
+ r'(?P[\w\-\.]+)' + r'(?::(?P\d+))?').replace('ptype', typ) ) match = pattern.match(proxies[typ]) diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py index 3e8938140f..432d471704 100644 --- a/src/calibre/db/fts/text.py +++ b/src/calibre/db/fts/text.py @@ -43,7 +43,7 @@ def html_to_text(root): pat = re.compile(r'\n{3,}') for body in root.xpath('h:body', namespaces=XPNSMAP): body.tail = '' - yield pat.sub('\n\n', ''.join(tag_to_text(body)).strip()) + yield pat.sub(r'\n\n', ''.join(tag_to_text(body)).strip()) def to_text(container, name): diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py index fdf0052022..adc3048cbc 100644 --- a/src/calibre/ebooks/conversion/plugins/rtf_input.py +++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -298,8 +298,8 @@ class RTFInput(InputFormatPlugin): # clean multiple \n res = re.sub(br'\n+', b'\n', res) # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines - # res = re.sub('\s*', '', res) - # res = re.sub('(?<=\n)\n{2}', + # res = re.sub(br'\s*', '', res) + # res = re.sub(br'(?<=\n)\n{2}', # '

\u00a0

\n'.encode('utf-8'), res) f.write(res) self.write_inline_css(inline_class, border_styles) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f76c70b04b..91d21d1ccf 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -274,7 +274,7 @@ class Dehyphenator: r'\s*

\s*<[iub]>)\s*(?P[\w\d]+)')% length) elif format == 'txt': intextmatch = re.compile( - '(?<=.{%i})(?P[^\\W\\-]+)(-|‐)( |\t)*(?P(\n( |\t)*)+)(?P[\\w\\d]+)'% length) + r'(?<=.{%i})(?P[^\W\-]+)(-|‐)( |\t)*(?P(\n( |\t)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': intextmatch = re.compile( r'(?!<)(?P[^\W\-]+)(-|‐)\s*(?P\w+)(?![^<]*?>)', re.UNICODE) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ac1fb1daf1..102749d4ad 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -35,11 +35,11 @@ class HeuristicProcessor: self.line_open = ( r'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*' r'(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*') - self.line_close = '()?\\s*()?\\s*()?\\s*' + self.line_close = r'()?\s*()?\s*()?\s*' self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*)', re.IGNORECASE) self.scene_break_open = '

' - self.common_in_text_endings = '["\'—’”,\\.!\\?\\…\\)„\\w]' - self.common_in_text_beginnings = '[\\w\'"“‘‛]' + self.common_in_text_endings = r'["\'—’”,\.!\?\…\)„\w]' + self.common_in_text_beginnings = r'[\w\'"“‘‛]' def is_pdftohtml(self, src): return "" in src[:1000] @@ -59,8 +59,8 @@ class HeuristicProcessor: else: delete_whitespace = re.compile(r'^\s*(?P.*?)\s*$') delete_quotes = re.compile(r'\'"') - txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(chap))) - txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(title))) + txt_chap = delete_quotes.sub('', delete_whitespace.sub(r'\g', html2text(chap))) + txt_title = delete_quotes.sub('', delete_whitespace.sub(r'\g', html2text(title))) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug('marked ' + str(self.html_preprocess_sections) + ' chapters & titles. - ' + str(chap) + ', ' + str(title)) @@ -214,30 +214,30 @@ class HeuristicProcessor: self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings') # Build the Regular Expressions in pieces - init_lookahead = '(?=<(p|div))' + init_lookahead = r'(?=<(p|div))' chapter_line_open = self.line_open title_line_open = (r'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?' 
- r'\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*') + r'\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*') chapter_header_open = r'(?P' title_header_open = r'(?P' - chapter_header_close = ')\\s*' - title_header_close = ')' + chapter_header_close = r')\s*' + title_header_close = r')' chapter_line_close = self.line_close - title_line_close = '(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>' + title_line_close = r'(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>' is_pdftohtml = self.is_pdftohtml(html) if is_pdftohtml: - title_line_open = '<(?P<outer2>p)[^>]*>\\s*' - title_line_close = '\\s*</(?P=outer2)>' + title_line_open = r'<(?P<outer2>p)[^>]*>\s*' + title_line_close = r'\s*</(?P=outer2)>' if blanks_between_paragraphs: - blank_lines = '(\\s*<p[^>]*>\\s*</p>){0,2}\\s*' + blank_lines = r'(\s*<p[^>]*>\s*</p>){0,2}\s*' else: blank_lines = '' - opt_title_open = '(' - opt_title_close = ')?' - n_lookahead_open = '(?!\\s*' - n_lookahead_close = ')\\s*' + opt_title_open = r'(' + opt_title_close = r')?' + n_lookahead_open = r'(?!\s*' + n_lookahead_close = r')\s*' default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" simple_title = r'(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)' @@ -369,10 +369,10 @@ class HeuristicProcessor: lookahead = '(?<=.{'+str(length)+r'}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýźâêîôûçąężłıãõñæøþðßěľščťžňďřůёђєіїјљњћўџѣа-я,:)\\IAß]|(?<!\&\w{4});))' em_en_lookahead = '(?<=.{'+str(length)+'}[\u2013\u2014])' soft_hyphen = '\xad' - line_ending = '\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?' 
- blanklines = '\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*' - line_opening = '<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*' - txt_line_wrap = '((\u0020|\u0009)*\n){1,4}' + line_ending = r'\s*(?P<style_close></(span|[iub])>)?\s*(</(p|div)>)?' + blanklines = r'\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*' + line_opening = r'<(p|div)[^>]*>\s*(?P<style_open><(span|[iub])[^>]*>)?\s*' + txt_line_wrap = r'((\u0020|\u0009)*\n){1,4}' if format == 'txt': unwrap_regex = lookahead+txt_line_wrap @@ -418,12 +418,12 @@ class HeuristicProcessor: # TODO - find out if there are cases where there are more than one <pre> tag or # other types of unmarked html and handle them in some better fashion add_markup = re.compile(r'(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) + html = add_markup.sub(r'</p>\n<p>', html) return html def arrange_htm_line_endings(self, html): - html = re.sub(r'\s*</(?P<tag>p|div)>', '</'+'\\g<tag>'+'>\n', html) - html = re.sub(r'\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*', '\n<'+'\\g<tag>'+'\\g<style>'+'>', html) + html = re.sub(r'\s*</(?P<tag>p|div)>', r'</\g<tag>'+'>\n', html) + html = re.sub(r'\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*', r'\n<\g<tag>\g<style>'+'>', html) return html def fix_nbsp_indents(self, html): @@ -546,7 +546,7 @@ class HeuristicProcessor: def markup_whitespaces(match): blanks = match.group(0) - blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks) + blanks = self.blankreg.sub(r'\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks) return blanks html = blanks_n_nopunct.sub(markup_whitespaces, html) @@ -556,10 +556,10 @@ class HeuristicProcessor: return html def detect_soft_breaks(self, html): - line = 
'(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')' + line = '(?P<initline>'+self.line_open+r'\s*(?P<init_content>.*?)'+self.line_close+')' line_two = '(?P<line_two>'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \ - '\\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')' - div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two + r'\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')' + div_break_candidate_pattern = line+r'\s*<div[^>]*>\s*</div>\s*'+line_two div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE) def convert_div_softbreaks(match): @@ -575,16 +575,16 @@ class HeuristicProcessor: html = div_break_candidate.sub(convert_div_softbreaks, html) if not self.blanks_deleted and self.blanks_between_paragraphs: - html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html) + html = self.multi_blank.sub(r'\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html) else: - html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html) + html = self.blankreg.sub(r'\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html) return html def detect_scene_breaks(self, html): scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \ - '<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?){1,10})\\s*'+self.line_close + r'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?){1,10})\s*'+self.line_close scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) - html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html) + html = scene_breaks.sub(self.scene_break_open+r'\g<break></p>', html) 
return html def markup_user_break(self, replacement_break): @@ -768,11 +768,11 @@ class HeuristicProcessor: is_pdftohtml = self.is_pdftohtml(html) if is_pdftohtml: - self.line_open = '<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*' - self.line_close = '\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>' + self.line_open = r'<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*' + self.line_close = r'\s*(</[ibu][^>]*>\s*)?</(?P=outer)>' # ADE doesn't render <br />, change to empty paragraphs - # html = re.sub('<br[^>]*>', '<p>\u00a0</p>', html) + # html = re.sub(r'<br[^>]*>', '<p>\u00a0</p>', html) # Determine whether the document uses interleaved blank lines self.blanks_between_paragraphs = self.analyze_blanks(html) @@ -791,7 +791,7 @@ class HeuristicProcessor: if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug('deleting blank lines') self.blanks_deleted = True - html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html) + html = self.multi_blank.sub(r'\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -845,7 +845,7 @@ class HeuristicProcessor: # headings and titles, images, etc doubleheading = re.compile( r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html) + html = doubleheading.sub(r'\g<firsthead>\n<h3\g<secondhead></h3>', html) # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks, # style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks. 
@@ -876,5 +876,5 @@ class HeuristicProcessor: if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly - html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html) + html = self.anyblank.sub(r'\n\g<openline>\u00a0\g<closeline>', html) return html diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index b258b64527..04b7226647 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -108,7 +108,7 @@ class HTMLConverter: lambda match: '<a'+match.group(1)+'></a>'), # Strip comments from <style> tags. This is needed as # sometimes there are unterminated comments - (re.compile(r'<\s*style.*?>(.*?)<\/\s*style\s*>', re.DOTALL|re.IGNORECASE), + (re.compile(r'<\s*style.*?>(.*?)</\s*style\s*>', re.DOTALL|re.IGNORECASE), lambda match: match.group().replace('<!--', '').replace('-->', '')), # remove <p> tags from within <a href> tags (re.compile(r'<\s*a\s+[^<>]*href\s*=[^<>]*>(.*?)<\s*/\s*a\s*>', re.DOTALL|re.IGNORECASE), diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index b7697b3213..bb225a8e0b 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -724,8 +724,8 @@ class Worker(Thread): # Get details {{{ # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Collapse whitespace - # desc = re.sub('\n+', '\n', desc) - # desc = re.sub(' +', ' ', desc) + # desc = re.sub(r'\n+', '\n', desc) + # desc = re.sub(r' +', ' ', desc) # Remove the notice about text referring to out of print editions desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) # Remove comments diff --git a/src/calibre/ebooks/metadata/sources/edelweiss.py b/src/calibre/ebooks/metadata/sources/edelweiss.py index 35d5d370a6..55c1d3baf5 100644 --- 
a/src/calibre/ebooks/metadata/sources/edelweiss.py +++ b/src/calibre/ebooks/metadata/sources/edelweiss.py @@ -120,8 +120,8 @@ class Worker(Thread): # {{{ # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Collapse whitespace - # desc = re.sub('\n+', '\n', desc) - # desc = re.sub(' +', ' ', desc) + # desc = re.sub(r'\n+', '\n', desc) + # desc = re.sub(r' +', ' ', desc) # Remove comments desc = re.sub(r'(?s)<!--.*?-->', '', desc) return sanitize_comments_html(desc) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index f81fee5cc2..1bc3b351b7 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -368,17 +368,17 @@ class MobiReader: self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:') self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html) - self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html) + self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', '', self.processed_html) # Swap inline and block level elements, and order block level elements according to priority # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec self.processed_html = re.sub( - r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html) + r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>\g<styletags>', self.processed_html) self.processed_html = re.sub( - r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html) + r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>\g<para>', self.processed_html) self.processed_html = re.sub( - 
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html) + r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>\g<blockquote>', self.processed_html) self.processed_html = re.sub( - r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html) + r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>\g<para>', self.processed_html) bods = htmls = 0 for x in re.finditer(r'</body>|</html>', self.processed_html): if x == '</body>': diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index 38760af55d..cad2a50c42 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -24,12 +24,12 @@ REGEXES = { 'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I), 'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501 'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I), - # 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I), - # 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), - # 'trimRe': re.compile('^\s+|\s+$/'), - # 'normalizeRe': re.compile('\s{2,}/'), - # 'killBreaksRe': re.compile('(<br\s*\/?>(\s| ?)*){1,}/'), - # 'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I), + # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I), + # 'replaceFontsRe': re.compile(r'<(/?)font[^>]*>',re.I), + # 'trimRe': re.compile(r'^\s+|\s+$/'), + # 'normalizeRe': re.compile(r'\s{2,}/'), + # 'killBreaksRe': re.compile(r'(<br\s*/?>(\s| ?)*){1,}/'), + # 'videoRe': re.compile(r'http://(www\.)?(youtube|vimeo)\.com', 
re.I), # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, } diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 73e872d046..a79fa4bd73 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -195,7 +195,7 @@ class RTFMLizer: def clean_text(self, text): # Remove excessive newlines - text = re.sub('%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text) + text = re.sub(r'%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text) # Remove excessive spaces text = re.sub(r'[ ]{2,}', ' ', text) diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py index 43534bddb3..81faa5b02d 100644 --- a/src/calibre/ebooks/snb/snbml.py +++ b/src/calibre/ebooks/snb/snbml.py @@ -156,10 +156,10 @@ class SNBMLizer: text = text.replace('\f+', ' ') # Single line paragraph. - text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) + text = re.sub(r'(?<=.)%s(?=.)' % os.linesep, ' ', text) # Remove multiple spaces. - # text = re.sub('[ ]{2,}', ' ', text) + # text = re.sub(r'[ ]{2,}', ' ', text) # Remove excessive newlines. text = re.sub(r'\n[ ]+\n', '\n\n', text) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index f83877cfa5..400a1c8dad 100644 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -760,7 +760,7 @@ class Textile: ''' what is this for? 
''' - pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) + pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?)://|/)\S+)(?=\s|$)', re.U) text = pattern.sub(self.refs, text) return text diff --git a/src/calibre/ebooks/textile/unsmarten.py b/src/calibre/ebooks/textile/unsmarten.py index c140da9894..1c94906b50 100644 --- a/src/calibre/ebooks/textile/unsmarten.py +++ b/src/calibre/ebooks/textile/unsmarten.py @@ -119,8 +119,8 @@ def unsmarten(txt): txt = re.sub(r'♦|♦|♦', r'{diamond}', txt) # diamond # Move into main code? - # txt = re.sub('\xa0', r'p. ', txt) # blank paragraph - # txt = re.sub('\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph - # txt = re.sub('\n \n', r'\n<br />\n', txt) # blank paragraph - br tag + # txt = re.sub(r'\xa0', r'p. ', txt) # blank paragraph + # txt = re.sub(r'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + # txt = re.sub(r'\n \n', r'\n<br />\n', txt) # blank paragraph - br tag return txt diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 5af1171685..7012fa47c9 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -222,7 +222,7 @@ def remove_indents(txt): ''' Remove whitespace at the beginning of each line. 
''' - return re.sub(r'^[\r\t\f\v ]+', r'', txt, flags=re.MULTILINE) + return re.sub(r'^[\r\t\f\v ]+', '', txt, flags=re.MULTILINE) def opf_writer(path, opf_name, manifest, spine, mi): diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index b795f0116e..e317b37020 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'"(.+)":'+i+r'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - text = re.sub(r'%?\('+i+'\\)\xa0?%?', r'', text) + text = re.sub(r'%?\('+i+'\\)\xa0?%?', '', text) # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, [r'\*', '_', r'\*']) @@ -101,7 +101,7 @@ class TextileMLizer(OEB2HTML): # remove spaces before a newline text = re.sub(r' +\n', r'\n', text) # remove newlines at top of file - text = re.sub(r'^\n+', r'', text) + text = re.sub(r'^\n+', '', text) # correct blockcode paras text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) # correct blockquote paras diff --git a/src/calibre/gui2/store/search/models.py b/src/calibre/gui2/store/search/models.py index 9ddd0a2fc9..ca8816cb4e 100644 --- a/src/calibre/gui2/store/search/models.py +++ b/src/calibre/gui2/store/search/models.py @@ -31,7 +31,7 @@ def comparable_price(text): m = re.sub(r'[.,\' ]', '.', match.group()) # remove all separators accept fraction, # leave only 2 digits in fraction - m = re.sub(r'\.(?!\d*$)', r'', m) + m = re.sub(r'\.(?!\d*$)', '', m) text = f'{float(m) * 100.:0>8.0f}' return text diff --git a/src/calibre/gui2/store/search/search.py b/src/calibre/gui2/store/search/search.py index 55bdb2f5e8..93a1c8dae9 100644 --- a/src/calibre/gui2/store/search/search.py +++ b/src/calibre/gui2/store/search/search.py @@ -248,7 +248,7 @@ class SearchDialog(QDialog, Ui_Dialog): query = re.sub(r'%s:"[^"]"' % loc, '', query) query = re.sub(r'%s:[^\s]*' % loc, '', query) # Remove logic. 
- query = re.sub(r'(^|\s|")(and|not|or|a|the|is|of)(\s|$|")', r' ', query) + query = re.sub(r'(^|\s|")(and|not|or|a|the|is|of)(\s|$|")', ' ', query) # Remove " query = query.replace('"', '') # Remove excess whitespace. diff --git a/src/calibre/gui2/store/stores/empik_plugin.py b/src/calibre/gui2/store/stores/empik_plugin.py index a75c0c0e2e..153b8af529 100644 --- a/src/calibre/gui2/store/stores/empik_plugin.py +++ b/src/calibre/gui2/store/stores/empik_plugin.py @@ -80,7 +80,7 @@ class EmpikStore(BasicStoreConfig, StorePlugin): # with closing(br.open('https://empik.com' + id.strip(), timeout=timeout/4)) as nf: # idata = html.fromstring(nf.read()) # crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()') - # formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x]) + # formats = ','.join([re.sub(r'ebook, ','', x.strip()) for x in crawled if 'ebook' in x]) counter -= 1 diff --git a/src/calibre/library/catalogs/csv_xml.py b/src/calibre/library/catalogs/csv_xml.py index 2f629a594e..c874c8d7d6 100644 --- a/src/calibre/library/catalogs/csv_xml.py +++ b/src/calibre/library/catalogs/csv_xml.py @@ -160,7 +160,7 @@ class CSV_XML(CatalogPlugin): if isinstance(item, str): opening_tag = re.search(r'<(\w+)( |>)', item) if opening_tag: - closing_tag = re.search(r'<\/%s>$' % opening_tag.group(1), item) + closing_tag = re.search(r'</%s>$' % opening_tag.group(1), item) if closing_tag: item = html2text(item) diff --git a/src/calibre/srv/content.py b/src/calibre/srv/content.py index 9d6c32ff40..761361f9a8 100644 --- a/src/calibre/srv/content.py +++ b/src/calibre/srv/content.py @@ -514,9 +514,9 @@ def set_note(ctx, rd, field, item_id, library_id): db_replacements[key] = f'{RESOURCE_URL_SCHEME}://{scheme}/{digest}' db_html = srv_html = html if db_replacements: - db_html = re.sub('|'.join(map(re.escape, db_replacements)), lambda m: db_replacements[m.group()], html) + db_html = re.sub(r'|'.join(map(re.escape, 
db_replacements)), lambda m: db_replacements[m.group()], html) if srv_replacements: - srv_html = re.sub('|'.join(map(re.escape, srv_replacements)), lambda m: srv_replacements[m.group()], html) + srv_html = re.sub(r'|'.join(map(re.escape, srv_replacements)), lambda m: srv_replacements[m.group()], html) db.set_notes_for(field, item_id, db_html, searchable_text, resources) rd.outheaders['Content-Type'] = 'text/html; charset=UTF-8' return srv_html diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 518040f610..86ba934c50 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -124,7 +124,7 @@ def default_is_link_wanted(url, tag): class RecursiveFetcher: LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in - ('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$')) + (r'.exe\s*$', r'.mp3\s*$', r'.ogg\s*$', r'^\s*mailto:', r'^\s*$')) # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in # ( #