From ac6912565aa5782bedaf9693ef12a4232a7be885 Mon Sep 17 00:00:00 2001 From: un-pogaz <46523284+un-pogaz@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:14:20 +0100 Subject: [PATCH] always use raw-string for regex (auto-fix) ruff 'RUF039' --- imgsrc/srv/generate.py | 2 +- recipes/alejakomiksu_com.recipe | 2 +- recipes/bild_de.recipe | 6 +- recipes/birmingham_evening_mail.recipe | 2 +- recipes/calgary_herald.recipe | 4 +- recipes/chr_mon.recipe | 10 +- recipes/chronicle_higher_ed.recipe | 2 +- recipes/courrierinternational.recipe | 2 +- recipes/de_standaard.recipe | 4 +- recipes/deredactie.recipe | 6 +- recipes/donga.recipe | 2 +- recipes/dwutygodnik.recipe | 2 +- recipes/dziennik_pl.recipe | 4 +- recipes/edmonton_journal.recipe | 4 +- recipes/esenja.recipe | 4 +- recipes/esensja_(rss).recipe | 2 +- recipes/folhadesaopaulo_sub.recipe | 2 +- recipes/galaxys_edge.recipe | 2 +- recipes/gosc_full.recipe | 2 +- recipes/gosc_niedzielny.recipe | 2 +- recipes/gva_be.recipe | 4 +- recipes/hackernews.recipe | 2 +- recipes/handelsblatt.recipe | 16 +- recipes/history_today.recipe | 2 +- recipes/india_today.recipe | 2 +- recipes/joop.recipe | 2 +- recipes/kurier.recipe | 10 +- recipes/kyungyhang.recipe | 2 +- recipes/le_monde_sub_paper.recipe | 2 +- recipes/lwn_weekly.recipe | 2 +- recipes/montreal_gazette.recipe | 4 +- recipes/newsweek_polska.recipe | 6 +- recipes/nikkei_news.recipe | 4 +- recipes/nrc_next.recipe | 2 +- recipes/ottawa_citizen.recipe | 4 +- recipes/outlook_business_magazine.recipe | 2 +- recipes/polter_pl.recipe | 4 +- recipes/private_eye.recipe | 6 +- recipes/science_news.recipe | 2 +- recipes/sol_haber.recipe | 4 +- recipes/standardmedia_ke.recipe | 2 +- recipes/the_age.recipe | 2 +- recipes/theoldie.recipe | 2 +- recipes/tweakers_net.recipe | 2 +- recipes/vancouver_province.recipe | 4 +- recipes/vancouver_sun.recipe | 4 +- recipes/vic_times.recipe | 36 +-- recipes/zeitde.recipe | 2 +- recipes/zeitde_sub.recipe | 8 +- ruff-strict-pep8.toml | 3 +- setup/__init__.py | 2 +- src/calibre/db/search.py | 2 +- src/calibre/devices/cybook/driver.py | 4 +- src/calibre/devices/utils.py | 4 +- src/calibre/ebooks/__init__.py | 2 +- src/calibre/ebooks/chm/metadata.py | 4 +- src/calibre/ebooks/conversion/preprocess.py | 12 +- src/calibre/ebooks/conversion/utils.py | 28 +-- src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/htmlz/oeb2html.py | 2 +- src/calibre/ebooks/hyphenate.py | 4 +- src/calibre/ebooks/lrf/html/convert_from.py | 18 +- src/calibre/ebooks/lrf/html/table.py | 2 +- src/calibre/ebooks/metadata/__init__.py | 2 +- src/calibre/ebooks/metadata/meta.py | 2 +- src/calibre/ebooks/metadata/pdb.py | 2 +- src/calibre/ebooks/metadata/sources/amazon.py | 2 +- .../ebooks/metadata/sources/search_engines.py | 2 +- src/calibre/ebooks/mobi/reader/mobi6.py | 2 +- src/calibre/ebooks/oeb/polish/main.py | 2 +- src/calibre/ebooks/oeb/polish/spell.py | 2 +- .../ebooks/oeb/transforms/rasterize.py | 2 +- src/calibre/ebooks/pdf/pdftohtml.py | 4 +- src/calibre/ebooks/pdf/render/fonts.py | 2 +- src/calibre/ebooks/pml/pmlml.py | 8 +- src/calibre/ebooks/readability/readability.py | 10 +- src/calibre/ebooks/rtf/rtfml.py | 4 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 +- src/calibre/ebooks/snb/snbml.py | 6 +- src/calibre/ebooks/textile/unsmarten.py | 214 +++++++++--------- src/calibre/ebooks/txt/markdownml.py | 6 +- src/calibre/ebooks/txt/processor.py | 10 +- src/calibre/ebooks/txt/txtml.py | 6 +- src/calibre/gui2/dialogs/search.py | 2 +- .../gui2/preferences/create_custom_column.py | 2 +- src/calibre/gui2/preferences/emailp.py | 2 +- .../gui2/tweak_book/editor/smarts/html.py | 2 +- .../library/catalogs/epub_mobi_builder.py | 6 +- src/calibre/library/catalogs/utils.py | 6 +- src/calibre/library/database.py | 4 +- src/calibre/utils/bibtex.py | 2 +- src/calibre/utils/complete.py | 2 +- src/calibre/utils/date.py | 4 +- src/calibre/utils/localization.py | 2 +- src/calibre/utils/search_query_parser.py | 2 +- src/calibre/web/feeds/news.py | 2 +- src/odf/easyliststyle.py | 2 +- 97 files changed, 315 insertions(+), 314 deletions(-) diff --git a/imgsrc/srv/generate.py b/imgsrc/srv/generate.py index 2eaae17ac2..f2600811ba 100644 --- a/imgsrc/srv/generate.py +++ b/imgsrc/srv/generate.py @@ -49,7 +49,7 @@ def merge(): clone_node(child, symbol) ans.append(symbol) ans = etree.tostring(ans, encoding='unicode', pretty_print=True, with_tail=False) - ans = re.sub(']+>', '', ans, count=1) + ans = re.sub(r']+>', '', ans, count=1) return ans diff --git a/recipes/alejakomiksu_com.recipe b/recipes/alejakomiksu_com.recipe index 2f22bbffaf..c08a5ada5f 100644 --- a/recipes/alejakomiksu_com.recipe +++ b/recipes/alejakomiksu_com.recipe @@ -29,6 +29,6 @@ class AlejaKomiksu(BasicNewsRecipe): def skip_ad_pages(self, soup): tag = soup.find(attrs={'class': 'rodzaj'}) if tag and tag.a.string.lower().strip() == 'recenzje': - link = soup.find(text=re.compile('recenzuje')) + link = soup.find(text=re.compile(r'recenzuje')) if link: return self.index_to_soup(link.parent['href'], raw=True) diff --git a/recipes/bild_de.recipe b/recipes/bild_de.recipe index a4150f80fc..2801ed376b 100644 --- a/recipes/bild_de.recipe +++ b/recipes/bild_de.recipe @@ -63,12 +63,12 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe): dict( attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}), dict(name='img', attrs={'alt': 'logo'}), - dict(name='div', attrs={'class': re.compile('infoEl')}), - dict(name='span', attrs={'class': re.compile('loupe')}) + dict(name='div', attrs={'class': re.compile(r'infoEl')}), + dict(name='span', attrs={'class': re.compile(r'loupe')}) ] remove_tags_after = [ - dict(name='div', attrs={'itemprop': re.compile('articleBody')}) + dict(name='div', attrs={'itemprop': re.compile(r'articleBody')}) ] def preprocess_html(self, soup): diff --git a/recipes/birmingham_evening_mail.recipe b/recipes/birmingham_evening_mail.recipe index 8217f13d73..3f4d858d54 100644 --- a/recipes/birmingham_evening_mail.recipe +++ b/recipes/birmingham_evening_mail.recipe @@ -58,7 +58,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.birminghammail.co.uk') cov = soup.find(attrs={'src': re.compile( - 'http://images.icnetwork.co.uk/upl/birm')}) + r'http://images.icnetwork.co.uk/upl/birm')}) cov = str(cov) cov2 = re.findall( r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index 981558e25d..062e1b2531 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 02a91ddc1f..c75fc5fd4c 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -39,12 +39,12 @@ class CSMonitor(BasicNewsRecipe): } remove_tags = [ - dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile('(^|| )podStoryRel($|| )', re.DOTALL)}), dict( + dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL)}), dict( attrs={'class': ['bottom-rel', 'hide']}), dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']}) ] keep_only_tags = [ dict(name='h1', attrs={'class': 'head'}), dict(name='h2', attrs={'class': 'subhead'}), dict(attrs={'class': [ - 'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)}) + 'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)}) ] remove_attributes = ['xmlns:fb'] @@ -74,11 +74,11 @@ class CSMonitor(BasicNewsRecipe): nurl = 'http://www.csmonitor.com' + nexttag['href'] soup2 = self.index_to_soup(nurl) texttag = soup2.find( - attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)}) + attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)}) if texttag: appendtag = soup.find( - attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)}) - for citem in texttag.findAll(attrs={'class': [re.compile('(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}): + attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)}) + for citem in texttag.findAll(attrs={'class': [re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}): citem.extract() self.append_page(soup2) texttag.extract() diff --git a/recipes/chronicle_higher_ed.recipe b/recipes/chronicle_higher_ed.recipe index 795aac4276..c61c54686b 100644 --- a/recipes/chronicle_higher_ed.recipe +++ b/recipes/chronicle_higher_ed.recipe @@ -47,7 +47,7 @@ class Chronicle(BasicNewsRecipe): # Find cover cover = soup0.find('div', attrs={ - 'class': 'side-content'}).find(attrs={'src': re.compile('photos/biz/Current')}) + 'class': 'side-content'}).find(attrs={'src': re.compile(r'photos/biz/Current')}) if cover is not None: if 'chronicle.com' in cover['src']: self.cover_url = cover['src'] diff --git a/recipes/courrierinternational.recipe b/recipes/courrierinternational.recipe index 193783171b..891901ead0 100644 --- a/recipes/courrierinternational.recipe +++ b/recipes/courrierinternational.recipe @@ -86,7 +86,7 @@ class CourrierInternational(BasicNewsRecipe): return br def preprocess_html(self, soup): - for link in soup.findAll('a', href=re.compile('^/')): + for link in soup.findAll('a', href=re.compile(r'^/')): link['href'] = 'http://www.courrierinternational.com' + link['href'] return soup diff --git a/recipes/de_standaard.recipe b/recipes/de_standaard.recipe index 4bb2222672..e374a22f44 100644 --- a/recipes/de_standaard.recipe +++ b/recipes/de_standaard.recipe @@ -71,10 +71,10 @@ class AdvancedUserRecipe1467571059(BasicNewsRecipe): remove_tags = [ dict(name=['embed', 'object']), dict(name='div', attrs={'class':['note NotePortrait', 'note']}), - dict(name='ul', attrs={'class':re.compile('article__share')}), + dict(name='ul', attrs={'class':re.compile(r'article__share')}), dict(name='div', attrs={'class':'slideshow__controls'}), dict(name='a', attrs={'role':'button'}), - dict(name='figure', attrs={'class':re.compile('video')}) + dict(name='figure', attrs={'class':re.compile(r'video')}) ] remove_attributes = ['width', 'height'] diff --git a/recipes/deredactie.recipe b/recipes/deredactie.recipe index 8fe8229a29..eb9dd676e6 100644 --- a/recipes/deredactie.recipe +++ b/recipes/deredactie.recipe @@ -31,9 +31,9 @@ class deredactie(BasicNewsRecipe): catnames = {} soup = self.index_to_soup( 'http://www.deredactie.be/cm/vrtnieuws.deutsch') - for elem in soup.findAll('li', attrs={'id': re.compile('^navItem[2-9]')}): + for elem in soup.findAll('li', attrs={'id': re.compile(r'^navItem[2-9]')}): a = elem.find('a', href=True) - m = re.search('(?<=/)[^/]*$', a['href']) + m = re.search(r'(?<=/)[^/]*$', a['href']) cat = str(m.group(0)) categories.append(cat) catnames[cat] = a['title'] @@ -45,7 +45,7 @@ class deredactie(BasicNewsRecipe): articles = [] soup = self.index_to_soup( 'http://www.deredactie.be/cm/vrtnieuws.deutsch/' + cat) - for a in soup.findAll('a', attrs={'href': re.compile('deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}): + for a in soup.findAll('a', attrs={'href': re.compile(r'deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}): skip_this_article = False url = a['href'].strip() if url.startswith('/'): diff --git a/recipes/donga.recipe b/recipes/donga.recipe index 95fb83f990..ddefdf61e3 100644 --- a/recipes/donga.recipe +++ b/recipes/donga.recipe @@ -51,7 +51,7 @@ class Donga(BasicNewsRecipe): # https://www.donga.com/news/[sections]/article/all/[date]/[gid]/1 # Return print version url with syntax: # https://www.donga.com/news/View?gid=[gid]&date=[date] - reobject = re.search('(?<=/all/)([0-9]*)/([0-9]*)', url) + reobject = re.search(r'(?<=/all/)([0-9]*)/([0-9]*)', url) date = reobject.group(1) gid = reobject.group(2) diff --git a/recipes/dwutygodnik.recipe b/recipes/dwutygodnik.recipe index 574acde94d..0c5f0bf9b5 100644 --- a/recipes/dwutygodnik.recipe +++ b/recipes/dwutygodnik.recipe @@ -33,7 +33,7 @@ class dwutygodnik(BasicNewsRecipe): browser.open('http://www.dwutygodnik.com/') # find the link - epublink = browser.find_link(text_regex=re.compile('Wydanie EPUB')) + epublink = browser.find_link(text_regex=re.compile(r'Wydanie EPUB')) # download ebook self.report_progress(0, _('Downloading ePUB')) diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index e0d88ca28e..58bceab229 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -21,8 +21,8 @@ class Dziennik_pl(BasicNewsRecipe): remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}' - preprocess_regexps = [(re.compile('Komentarze:'), lambda m: ''), (re.compile( - '

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] + preprocess_regexps = [(re.compile(r'Komentarze:'), lambda m: ''), (re.compile( + r'

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags = [dict(id='article')] remove_tags = [dict(name='div', attrs={'class': ['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class': ['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')] # noqa: E501 feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe index 5800a552d5..9ea93456f1 100644 --- a/recipes/edmonton_journal.recipe +++ b/recipes/edmonton_journal.recipe @@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/esenja.recipe b/recipes/esenja.recipe index c17537af9c..42f43f7192 100644 --- a/recipes/esenja.recipe +++ b/recipes/esenja.recipe @@ -51,7 +51,7 @@ class Esensja(BasicNewsRecipe): def parse_index(self): soup = self.index_to_soup('http://www.esensja.pl/magazyn/') - a = soup.find('a', attrs={'href': re.compile('.*/index.html')}) + a = soup.find('a', attrs={'href': re.compile(r'.*/index.html')}) year = a['href'].split('/')[0] month = a['href'].split('/')[1] self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/' @@ -149,7 +149,7 @@ class Esensja(BasicNewsRecipe): info = tag.find(attrs={'class': 'img_info'}) text = str(tag) if not src: - src = re.search('src="[^"]*?"', text) + src = re.search(r'src="[^"]*?"', text) if src: src = src.group(0) src = src[5:].replace('//', '/') diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe index f91607702d..6ce8b697e1 100644 --- a/recipes/esensja_(rss).recipe +++ b/recipes/esensja_(rss).recipe @@ -95,7 +95,7 @@ class EsensjaRSS(BasicNewsRecipe): info = tag.find(attrs={'class': 'img_info'}) text = str(tag) if not src: - src = re.search('src="[^"]*?"', text) + src = re.search(r'src="[^"]*?"', text) if src: src = src.group(0) src = src[5:].replace('//', '/') diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe index e9388030a2..c3a4ac2cf6 100644 --- a/recipes/folhadesaopaulo_sub.recipe +++ b/recipes/folhadesaopaulo_sub.recipe @@ -109,7 +109,7 @@ img { background: none !important; float: none; margin: 0px; } for post in soup.findAll('a'): strpost = str(post) - if re.match('
", + (re.compile(r"
", re.DOTALL | re.IGNORECASE), lambda match: ''), ] diff --git a/recipes/le_monde_sub_paper.recipe b/recipes/le_monde_sub_paper.recipe index 46b366a4d0..e2651ab6d0 100644 --- a/recipes/le_monde_sub_paper.recipe +++ b/recipes/le_monde_sub_paper.recipe @@ -121,7 +121,7 @@ class LeMondeAbonne(BasicNewsRecipe): files = os.listdir(path) nb_index_files = len([ - name for name in files if re.match('frame_gauche_[0-9]+.html', name) + name for name in files if re.match(r'frame_gauche_[0-9]+.html', name) ]) flux = [] diff --git a/recipes/lwn_weekly.recipe b/recipes/lwn_weekly.recipe index 6867f22ccb..162078db8b 100644 --- a/recipes/lwn_weekly.recipe +++ b/recipes/lwn_weekly.recipe @@ -144,7 +144,7 @@ class WeeklyLWN(BasicNewsRecipe): # Most articles have anchors in their titles, *except* the # security vulnerabilities article_anchor = curr.find( - name='a', attrs={'href': re.compile('^/Articles/')}) + name='a', attrs={'href': re.compile(r'^/Articles/')}) if article_anchor: article_url = article_anchor.get('href') diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe index 6d8a65cb03..20eca5fda4 100644 --- a/recipes/montreal_gazette.recipe +++ b/recipes/montreal_gazette.recipe @@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/newsweek_polska.recipe b/recipes/newsweek_polska.recipe index ce34da87eb..4733dfe135 100644 --- a/recipes/newsweek_polska.recipe +++ b/recipes/newsweek_polska.recipe @@ -71,21 +71,21 @@ class Newsweek(BasicNewsRecipe): strong = p.find('strong') if strong: newest = re.compile( - 'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek') + r'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek') if newest.search(str(strong)): strong.extract() continue itunes = p.find('a') if itunes: - reurl = re.compile('itunes.apple.com') + reurl = re.compile(r'itunes.apple.com') if reurl.search(str(itunes['href'])): p.extract() continue imagedesc = p.find('div', attrs={'class': 'image-desc'}) if imagedesc: - redesc = re.compile('Okładka numeru') + redesc = re.compile(r'Okładka numeru') if (redesc.search(str(imagedesc))): p.extract() continue diff --git a/recipes/nikkei_news.recipe b/recipes/nikkei_news.recipe index bd493cf260..afc88e86ec 100644 --- a/recipes/nikkei_news.recipe +++ b/recipes/nikkei_news.recipe @@ -77,10 +77,10 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe): print('-------------------------get index of paper--------------------------------') result = [] soup = self.index_to_soup('http://www.nikkei.com/paper/') - sections = soup.findAll(attrs={'class': re.compile('.*cmn-article_title.*')}) + sections = soup.findAll(attrs={'class': re.compile(r'.*cmn-article_title.*')}) for sect in sections: - sect_title = sect.find(attrs={'class' : re.compile('.*cmnc-((large)|(middle)|(small)).*')}) + sect_title = sect.find(attrs={'class' : re.compile(r'.*cmnc-((large)|(middle)|(small)).*')}) if sect_title is None: continue sect_title = sect_title.contents[0] diff --git a/recipes/nrc_next.recipe b/recipes/nrc_next.recipe index bc6e73e5d3..bcc5e4931c 100644 --- a/recipes/nrc_next.recipe +++ b/recipes/nrc_next.recipe @@ -62,7 +62,7 @@ class NRCNext(BasicNewsRecipe): zfile = zipfile.ZipFile(BytesIO(epubraw), 'r') zfile.extractall(self.output_dir) namelist = zfile.namelist() - emre = re.compile('<em(?:.*)>(.*)</em>') + emre = re.compile(r'<em(?:.*)>(.*)</em>') subst = '\\1' for name in namelist: _, ext = os.path.splitext(name) diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe index 21af4a8d5e..a635c4b0a2 100644 --- a/recipes/ottawa_citizen.recipe +++ b/recipes/ottawa_citizen.recipe @@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/outlook_business_magazine.recipe b/recipes/outlook_business_magazine.recipe index 1ab5abbaa4..50fe0b6ffd 100644 --- a/recipes/outlook_business_magazine.recipe +++ b/recipes/outlook_business_magazine.recipe @@ -48,7 +48,7 @@ class outlook(BasicNewsRecipe): return [('Articles', ans)] def preprocess_raw_html(self, raw, *a): - m = re.search('id="__NEXT_DATA__" type="application/json">', raw) + m = re.search(r'id="__NEXT_DATA__" type="application/json">', raw) raw = raw[m.start():] raw = raw.split('>', 1)[1] data = json.JSONDecoder().raw_decode(raw)[0] diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe index 96e761087b..f5cf9e631e 100644 --- a/recipes/polter_pl.recipe +++ b/recipes/polter_pl.recipe @@ -41,9 +41,9 @@ class Polter(BasicNewsRecipe): (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html')] def preprocess_html(self, soup): - for s in soup.findAll(attrs={'style': re.compile('float: ?left')}): + for s in soup.findAll(attrs={'style': re.compile(r'float: ?left')}): s['class'] = 'floatleft' - for s in soup.findAll(attrs={'style': re.compile('float: ?right')}): + for s in soup.findAll(attrs={'style': re.compile(r'float: ?right')}): s['class'] = 'floatright' for s in soup.findAll(style=True): if 'bold;' in s['style']: diff --git a/recipes/private_eye.recipe b/recipes/private_eye.recipe index 4f0076e06b..03a03c3e18 100644 --- a/recipes/private_eye.recipe +++ b/recipes/private_eye.recipe @@ -161,9 +161,9 @@ class PrivateEyeRecipe(BasicNewsRecipe): {'name': 'div', 'attrs': {'id': 'about-covers'}}, {'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}}, {'name': 'iframe'}, - {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}}, - {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}}, - {'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}}, + {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}}, + {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}}, + {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}}, ] # Convert headers to h1, strapline to h4 diff --git a/recipes/science_news.recipe b/recipes/science_news.recipe index 50198a3d19..4be6289658 100644 --- a/recipes/science_news.recipe +++ b/recipes/science_news.recipe @@ -54,7 +54,7 @@ class ScienceNewsIssue(BasicNewsRecipe): # Get articles soup = self.index_to_soup(url) soup = soup.find('main', attrs={'id':'content'}) - re_article = re.compile('https://www.sciencenews.org/article/') + re_article = re.compile(r'https://www.sciencenews.org/article/') stories = [] past_urls = set() for sec in soup.find_all(href=re_article): diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe index b2b1c9d421..ea9ee3dde8 100644 --- a/recipes/sol_haber.recipe +++ b/recipes/sol_haber.recipe @@ -76,8 +76,8 @@ class SolHaberRecipe(BasicNewsRecipe): result = [] articles_dict = {} - author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$') - category_regexp = re.compile('^http://.*?/(.+?)/.*$') + author_regexp = re.compile(r'^http://.*?/yazarlar/(.*?)/.*$') + category_regexp = re.compile(r'^http://.*?/(.+?)/.*$') for section_tuple in self.section_tuples: diff --git a/recipes/standardmedia_ke.recipe b/recipes/standardmedia_ke.recipe index e702e15a31..8224a622a9 100644 --- a/recipes/standardmedia_ke.recipe +++ b/recipes/standardmedia_ke.recipe @@ -43,7 +43,7 @@ class StandardMediaKeRecipe(BasicNewsRecipe): def print_version(self, url): import re - p = re.compile('http://www.standardmedia.co.ke/.*InsidePage.php') + p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php') return p.sub('http://www.standardmedia.co.ke/print.php', url) def preprocess_html(self, soup): diff --git a/recipes/the_age.recipe b/recipes/the_age.recipe index 662b7b61ff..871592bcdb 100644 --- a/recipes/the_age.recipe +++ b/recipes/the_age.recipe @@ -89,7 +89,7 @@ class TheAge(BasicNewsRecipe): for i in soup.findAll('a'): href = i['href'] - if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href): + if href and re.match(r'http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href): return href return None diff --git a/recipes/theoldie.recipe b/recipes/theoldie.recipe index 7882e1b31d..6c20d77310 100644 --- a/recipes/theoldie.recipe +++ b/recipes/theoldie.recipe @@ -92,7 +92,7 @@ class PrivateEyeRecipe(BasicNewsRecipe): # 1. Title. By author #.2. Title by author: subtitle # 3. Title: author: subtitle - title_author_re = re.compile('^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$') + title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$') # Separate author from title (where it is specified) def title_author(self, head): diff --git a/recipes/tweakers_net.recipe b/recipes/tweakers_net.recipe index 13a78898b7..40ea3773e7 100644 --- a/recipes/tweakers_net.recipe +++ b/recipes/tweakers_net.recipe @@ -38,7 +38,7 @@ class Tweakers(BasicNewsRecipe): 'class': ['sidebar', 'advertorial'] }, { - 'class': re.compile('nextPrevious') + 'class': re.compile(r'nextPrevious') }, ] no_stylesheets = True diff --git a/recipes/vancouver_province.recipe b/recipes/vancouver_province.recipe index 7e2e354bc2..ef49d79e3e 100644 --- a/recipes/vancouver_province.recipe +++ b/recipes/vancouver_province.recipe @@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/vancouver_sun.recipe b/recipes/vancouver_sun.recipe index 03016d6e25..5b8ab19b78 100644 --- a/recipes/vancouver_sun.recipe +++ b/recipes/vancouver_sun.recipe @@ -127,7 +127,7 @@ class CanWestPaper(BasicNewsRecipe): .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } #photocredit { font-size: xx-small; font-weight: normal; }''' - keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})] + keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})] remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'}, dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict( @@ -141,7 +141,7 @@ class CanWestPaper(BasicNewsRecipe): name='div', attrs={'class': 'copyright'}), dict(name='div', attrs={'class': 'rule_grey_solid'}), dict(name='div', attrs={'id': 'soundoff'}), - dict(name='div', attrs={'id': re.compile('flyer')}), + dict(name='div', attrs={'id': re.compile(r'flyer')}), dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})] def get_cover_url(self): diff --git a/recipes/vic_times.recipe b/recipes/vic_times.recipe index c4893c606b..7be3e60b17 100644 --- a/recipes/vic_times.recipe +++ b/recipes/vic_times.recipe @@ -82,28 +82,28 @@ class TimesColonist(BasicNewsRecipe): .caption { font-size: xx-small; font-style: italic; font-weight: normal; } ''' keep_only_tags = [ - dict(name='div', attrs={'class': re.compile('main.content')})] + dict(name='div', attrs={'class': re.compile(r'main.content')})] def __init__(self, options, log, progress_reporter): self.remove_tags = [{'class': 'comments'}, {'id': 'photocredit'}, dict(name='div', attrs={ - 'class': re.compile('top.controls')}), + 'class': re.compile(r'top.controls')}), dict(name='div', attrs={ - 'class': re.compile('^comments')}), + 'class': re.compile(r'^comments')}), dict(name='div', attrs={ - 'class': re.compile('social')}), + 'class': re.compile(r'social')}), dict(name='div', attrs={ - 'class': re.compile('tools')}), + 'class': re.compile(r'tools')}), dict(name='div', attrs={ - 'class': re.compile('bottom.tools')}), + 'class': re.compile(r'bottom.tools')}), dict(name='div', attrs={ - 'class': re.compile('window')}), - dict(name='div', attrs={'class': re.compile('related.news.element')})] + 'class': re.compile(r'window')}), + dict(name='div', attrs={'class': re.compile(r'related.news.element')})] print('PROFILE NAME = ' + options.output_profile.short_name) if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']: self.remove_tags.append( - dict(name='div', attrs={'class': re.compile('image-container')})) + dict(name='div', attrs={'class': re.compile(r'image-container')})) BasicNewsRecipe.__init__(self, options, log, progress_reporter) def get_cover_url(self): @@ -173,19 +173,19 @@ class TimesColonist(BasicNewsRecipe): return soup def preprocess_html(self, soup): - byline = soup.find('p', attrs={'class': re.compile('ancillary')}) + byline = soup.find('p', attrs={'class': re.compile(r'ancillary')}) if byline is not None: authstr = self.tag_to_string(byline, False) - authstr = re.sub('/ *Times Colonist', '/', + authstr = re.sub(r'/ *Times Colonist', '/', authstr, flags=re.IGNORECASE) - authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE) + authstr = re.sub(r'BY */', '', authstr, flags=re.IGNORECASE) newdiv = new_tag(soup, 'div') newdiv.insert(0, authstr) newdiv['class'] = 'byline' byline.replaceWith(newdiv) - for caption in soup.findAll('p', attrs={'class': re.compile('caption')}): + for caption in soup.findAll('p', attrs={'class': re.compile(r'caption')}): capstr = self.tag_to_string(caption, False) - capstr = re.sub('Photograph by.*$', '', + capstr = re.sub(r'Photograph by.*$', '', capstr, flags=re.IGNORECASE) newdiv = new_tag(soup, 'div') newdiv.insert(0, capstr) @@ -239,13 +239,13 @@ class TimesColonist(BasicNewsRecipe): except: return ans mainsoup = soup.find( - 'div', attrs={'class': re.compile('main.content')}) + 'div', attrs={'class': re.compile(r'main.content')}) article_list = [] - for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}): + for wdiv in mainsoup.findAll('div', attrs={'id': re.compile(r'featured.story')}): for htag in wdiv.findAll('h3'): self.handle_articles(htag, article_list, sectitle) - for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}): - for wdiv in mainsoup.findAll('div', attrs={'class': re.compile('article.row')}): + for ladiv in mainsoup.findAll(attrs={'class': re.compile(r'leading.articles')}): + for wdiv in mainsoup.findAll('div', attrs={'class': re.compile(r'article.row')}): for htag in wdiv.findAll('h2'): self.handle_articles(htag, article_list, sectitle) ans.append((sectitle, article_list)) diff --git a/recipes/zeitde.recipe b/recipes/zeitde.recipe index 1deb62ba0a..6b5593432a 100644 --- a/recipes/zeitde.recipe +++ b/recipes/zeitde.recipe @@ -139,7 +139,7 @@ class ZeitDe(BasicNewsRecipe): body.insert(0, header) # Add real img tags for images - for container in soup.findAll(class_=re.compile('__media-container$')): + for container in soup.findAll(class_=re.compile(r'__media-container$')): img = container.find('noscript') if img is not None: img.name = 'div' diff --git a/recipes/zeitde_sub.recipe b/recipes/zeitde_sub.recipe index a255354257..3895c3fe8d 100644 --- a/recipes/zeitde_sub.recipe +++ b/recipes/zeitde_sub.recipe @@ -200,11 +200,11 @@ class ZeitEPUBAbo(BasicNewsRecipe): # browser.follow_link(abolink) # find page for latest issue latestlink = browser.find_link(text_regex=re.compile( - '.*ZUR AKTUELLEN AUSGABE.*')) + r'.*ZUR AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # now find the correct file, we will still use the ePub file epublink = browser.find_link(text_regex=re.compile( - '.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017 + r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017 response = browser.follow_link(epublink) self.report_progress(1, _('next step')) @@ -266,11 +266,11 @@ class ZeitEPUBAbo(BasicNewsRecipe): # browser.follow_link(abolink) # find page for latest issue latestlink = browser.find_link(text_regex=re.compile( - '.*ZUR AKTUELLEN AUSGABE.*')) + r'.*ZUR AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # actual cover search pdflink = browser.find_link(text_regex=re.compile( - '.*GESAMT-PDF LADEN.*')) + r'.*GESAMT-PDF LADEN.*')) cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + ( urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf') self.log.warning('PDF link found:') diff --git a/ruff-strict-pep8.toml b/ruff-strict-pep8.toml index ac6a054c2b..5b887c9d88 100644 --- a/ruff-strict-pep8.toml +++ b/ruff-strict-pep8.toml @@ -34,6 +34,7 @@ select = [ # preview rules 'RUF051', 'RUF056', # useless dict operation 'RUF055', # unnecessary regex + 'RUF039', # always use raw-string for regex ] [lint.per-file-ignores] @@ -46,7 +47,7 @@ select = [ "src/calibre/gui2/store/stores/*" = ['UP'] "src/calibre/gui2/tts/manager.py" = ['UP037'] "src/calibre/utils/copy_files.py" = ['UP037'] -"src/calibre/utils/smartypants.py" = ['RUF055'] +"src/calibre/utils/smartypants.py" = ['RUF039', 'RUF055'] "src/qt/*.py" = ['I'] "src/qt/*.pyi" = ['I'] diff --git a/setup/__init__.py b/setup/__init__.py index 2ced671aa4..213fa3687e 100644 --- a/setup/__init__.py +++ b/setup/__init__.py @@ -17,7 +17,7 @@ import time from contextlib import contextmanager from functools import lru_cache -iswindows = re.search('win(32|64)', sys.platform) +iswindows = re.search(r'win(32|64)', sys.platform) ismacos = 'darwin' in sys.platform isfreebsd = 'freebsd' in sys.platform isnetbsd = 'netbsd' in sys.platform diff --git a/src/calibre/db/search.py b/src/calibre/db/search.py index a871399e45..35de3e02c8 100644 --- a/src/calibre/db/search.py +++ b/src/calibre/db/search.py @@ -657,7 +657,7 @@ class Parser(SearchQueryParser): # {{{ if location == 'template': try: - template, sep, query = regex.split('#@#:([tdnb]):', query, flags=regex.IGNORECASE) + template, sep, query = regex.split(r'#@#:([tdnb]):', query, flags=regex.IGNORECASE) if sep: sep = sep.lower() else: diff --git a/src/calibre/devices/cybook/driver.py b/src/calibre/devices/cybook/driver.py index 61ba5c7018..07ef10bb77 100644 --- a/src/calibre/devices/cybook/driver.py +++ b/src/calibre/devices/cybook/driver.py @@ -34,7 +34,7 @@ class CYBOOK(USBMS): VENDOR_NAME = 'BOOKEEN' WINDOWS_MAIN_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-FD') - WINDOWS_CARD_A_MEM = re.compile('CYBOOK_(OPUS|GEN3)__-SD') + WINDOWS_CARD_A_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-SD') OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Cybook') EBOOK_DIR_MAIN = 'eBooks' @@ -72,7 +72,7 @@ class ORIZON(CYBOOK): VENDOR_NAME = ['BOOKEEN', 'LINUX'] WINDOWS_MAIN_MEM = re.compile(r'(CYBOOK_ORIZON__-FD)|(FILE-STOR_GADGET)') - WINDOWS_CARD_A_MEM = re.compile('(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)') + WINDOWS_CARD_A_MEM = re.compile(r'(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)') EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Digital Editions' diff --git a/src/calibre/devices/utils.py b/src/calibre/devices/utils.py index 763d967115..da71e3dbfe 100644 --- a/src/calibre/devices/utils.py +++ b/src/calibre/devices/utils.py @@ -58,11 +58,11 @@ def build_template_regexp(template): try: template = template.rpartition('/')[2] - return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)') + return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)') except: prints('Failed to parse template: %r'%template) template = '{title} - {authors}' - return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)') + return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)') def create_upload_path(mdata, fname, template, sanitize, diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index cd4b5edcec..7fe8ab3dfc 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -239,7 +239,7 @@ def generate_masthead(title, output_path=None, width=600, height=60): def escape_xpath_attr(value): if '"' in value: if "'" in value: - parts = re.split('("+)', value) + parts = re.split(r'("+)', value) ans = [] for x in parts: if x: diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 1be8175766..3e43f086e3 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -42,7 +42,7 @@ def _metadata_from_table(soup, searchfor): # on the home page. cue some nasty special-case hacks... if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I): meta = _detag(td.findNextSibling('td')) - return re.sub('^:', '', meta).strip() + return re.sub(r'^:', '', meta).strip() else: meta = _detag(td) return re.sub(r'^[^:]+:', '', meta).strip() @@ -89,7 +89,7 @@ def _get_comments(soup): def _get_cover(soup, rdr): ans = None try: - ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src'] + ans = soup.find('img', alt=re.compile(r'cover', flags=re.I))['src'] except TypeError: # meeehh, no handy alt-tag goodness, try some hackery # the basic idea behind this is that in general, the cover image diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ed3aab3ee8..54cc9afb99 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -16,7 +16,7 @@ XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' -_span_pat = re.compile('', re.DOTALL|re.IGNORECASE) +_span_pat = re.compile(r'', re.DOTALL|re.IGNORECASE) LIGATURES = { # 'Æ': 'AE', @@ -92,7 +92,7 @@ class DocAnalysis: elif format == 'pdf': linere = re.compile(r'(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) elif format == 'spanned_html': - linere = re.compile('(?<=)', re.DOTALL) + linere = re.compile(r'(?<=)', re.DOTALL) elif format == 'txt': linere = re.compile('.*?\n') self.lines = linere.findall(raw) @@ -430,16 +430,16 @@ def book_designer_rules(): if ans is None: ans = book_designer_rules.ans = [ # HR - (re.compile('
', re.IGNORECASE), + (re.compile(r'
', re.IGNORECASE), lambda match : ' '), # Create header tags (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile(r'<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), - (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile(r'<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), ] return ans @@ -458,7 +458,7 @@ class HTMLPreProcessor: re.IGNORECASE).search(src) is not None def is_book_designer(self, raw): - return re.search('<]*id=BookTitle', raw) is not None + return re.search(r'<]*id=BookTitle', raw) is not None def is_pdftohtml(self, src): return "" in src[:1000] diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2a4682f48e..fbb626f069 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -27,7 +27,7 @@ class HeuristicProcessor: self.chapters_with_title = 0 self.blanks_deleted = False self.blanks_between_paragraphs = False - self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) + self.linereg = re.compile(r'(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

(\s*]*>\s*
\s*)*){2,}(?!\s*', re.DOTALL) + htm_end_ere = re.compile(r'', re.DOTALL) line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) htm_end = htm_end_ere.findall(raw) line_end = line_end_ere.findall(raw) @@ -209,7 +209,7 @@ class HeuristicProcessor: typical_chapters = 15000. self.min_chapters = int(ceil(wordcount / typical_chapters)) self.log.debug('minimum chapters required are: '+str(self.min_chapters)) - heading = re.compile(']*>', re.IGNORECASE) + heading = re.compile(r']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings') @@ -299,7 +299,7 @@ class HeuristicProcessor: break full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close if n_lookahead_req: - n_lookahead = re.sub('(ou|in|cha)', 'lookahead_', full_chapter_line) + n_lookahead = re.sub(r'(ou|in|cha)', 'lookahead_', full_chapter_line) if not analyze: self.log.debug('Marked ' + str(self.html_preprocess_sections) + ' headings, ' + log_message) @@ -442,7 +442,7 @@ class HeuristicProcessor: # Delete microsoft 'smart' tags html = re.sub('(?i)', '', html) # Re-open self closing paragraph tags - html = re.sub('/]*/>', '

', html) + html = re.sub(r'/]*/>', '

', html) # Get rid of empty span, bold, font, em, & italics tags fmt_tags = 'font|[ibu]|em|strong' open_fmt_pat, close_fmt_pat = fr'<(?:{fmt_tags})(?:\s[^>]*)?>', f'' @@ -462,8 +462,8 @@ class HeuristicProcessor: determines the type of html line ending used most commonly in a document use before calling docanalysis functions ''' - paras_reg = re.compile(']*>', re.IGNORECASE) - spans_reg = re.compile(']*>', re.IGNORECASE) + paras_reg = re.compile(r']*>', re.IGNORECASE) + spans_reg = re.compile(r']*>', re.IGNORECASE) paras = len(paras_reg.findall(html)) spans = len(spans_reg.findall(html)) if spans > 1: @@ -557,8 +557,8 @@ class HeuristicProcessor: def detect_soft_breaks(self, html): line = '(?P'+self.line_open+'\\s*(?P.*?)'+self.line_close+')' - line_two = '(?P'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \ - '\\s*(?P.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')' + line_two = '(?P'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \ + '\\s*(?P.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')' div_break_candidate_pattern = line+'\\s*]*>\\s*
\\s*'+line_two div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE) @@ -596,8 +596,8 @@ class HeuristicProcessor: All other html is converted to text. ''' hr_open = '
' - if re.findall('(<|>)', replacement_break): - if re.match('^)', replacement_break): + if re.match(r'^\\d+).*', '\\g', replacement_break)) @@ -608,11 +608,11 @@ class HeuristicProcessor: else: replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break) divpercent = (100 - width) // 2 - hr_open = re.sub('45', str(divpercent), hr_open) + hr_open = re.sub(r'45', str(divpercent), hr_open) scene_break = hr_open+replacement_break+'
' else: scene_break = hr_open+'
' - elif re.match('^' else: from calibre.utils.html2text import html2text @@ -638,7 +638,7 @@ class HeuristicProcessor: empty_paragraph = '\n

\n' self.in_blockquote = False self.previous_was_paragraph = False - html = re.sub(']*>', '', html) + html = re.sub(r']*>', '', html) def convert_styles(match): # print('raw styles are: '+match.group('styles')) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 7b67ad15cd..c739fac934 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -91,7 +91,7 @@ class HTMLFile: HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE) - TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) + TITLE_PAT = re.compile(r'([^<>]+)', re.IGNORECASE) LINK_PAT = re.compile( r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', re.DOTALL|re.IGNORECASE) diff --git a/src/calibre/ebooks/htmlz/oeb2html.py b/src/calibre/ebooks/htmlz/oeb2html.py index a950fce2c3..52611d82b0 100644 --- a/src/calibre/ebooks/htmlz/oeb2html.py +++ b/src/calibre/ebooks/htmlz/oeb2html.py @@ -269,7 +269,7 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): tag = 'div' # Add page-break-brefore: always because renders typically treat a new file (we're merging files) # as a page break and remove all other page break types that might be set. - style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a) + style_a = 'page-break-before: always; %s' % re.sub(r'page-break-[^:]+:[^;]+;?', '', style_a) # Remove unnecessary spaces. style_a = re.sub(r'\s{2,}', ' ', style_a).strip() tags.append(tag) diff --git a/src/calibre/ebooks/hyphenate.py b/src/calibre/ebooks/hyphenate.py index 13718f1cd5..6a19eaeb1a 100644 --- a/src/calibre/ebooks/hyphenate.py +++ b/src/calibre/ebooks/hyphenate.py @@ -34,8 +34,8 @@ class Hyphenator: def _insert_pattern(self, pattern): # Convert a pattern like 'a1bc3d4' into a string of chars 'abcd' # and a list of points [ 1, 0, 3, 4 ]. - chars = re.sub('[0-9]', '', pattern) - points = [int(d or 0) for d in re.split('[.a-z]', pattern)] + chars = re.sub(r'[0-9]', '', pattern) + points = [int(d or 0) for d in re.split(r'[.a-z]', pattern)] # Insert the pattern into the tree. Each character finds a dict # another level down in the tree, and leaf nodes have the list of diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 92f2710a80..b258b64527 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -163,7 +163,7 @@ class HTMLConverter: # Fix Book Designer markup BOOK_DESIGNER = [ # HR - (re.compile('
', re.IGNORECASE), + (re.compile(r'
', re.IGNORECASE), lambda match : ' '), # Create header tags (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), @@ -279,7 +279,7 @@ class HTMLConverter: if isinstance(src, bytes): src = src.decode('utf-8', 'replace') match = self.PAGE_BREAK_PAT.search(src) - if match and not re.match('avoid', match.group(1), re.IGNORECASE): + if match and not re.match(r'avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) if ncss: @@ -324,10 +324,10 @@ class HTMLConverter: def is_baen(self, soup): return bool(soup.find('meta', attrs={'name':'Publisher', - 'content':re.compile('Baen', re.IGNORECASE)})) + 'content':re.compile(r'Baen', re.IGNORECASE)})) def is_book_designer(self, raw): - return bool(re.search('<]*id=BookTitle', raw)) + return bool(re.search(r'<]*id=BookTitle', raw)) def preprocess(self, raw): nmassage = [] @@ -1152,7 +1152,7 @@ class HTMLConverter: def font_weight(val): ans = 0 - m = re.search('([0-9]+)', val) + m = re.search(r'([0-9]+)', val) if m: ans = int(m.group(1)) elif val.find('bold') >= 0 or val.find('strong') >= 0: @@ -1544,7 +1544,7 @@ class HTMLConverter: with open(path, 'rb') as f: src = f.read().decode('utf-8', 'replace') match = self.PAGE_BREAK_PAT.search(src) - if match and not re.match('avoid', match.group(1), re.IGNORECASE): + if match and not re.match(r'avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) except OSError: @@ -1869,11 +1869,11 @@ def process_file(path, options, logger): header.append(fheader + ' ') book, fonts = Book(options, logger, header=header, **args) le = re.compile(options.link_exclude) if options.link_exclude else \ - re.compile('$') + re.compile(r'$') pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \ - re.compile('$') + re.compile(r'$') fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \ - re.compile('$') + re.compile(r'$') cq = options.chapter_attr.split(',') if len(cq) < 3: raise ValueError('The --chapter-attr setting must have 2 commas.') diff --git a/src/calibre/ebooks/lrf/html/table.py b/src/calibre/ebooks/lrf/html/table.py index 803873ed30..166bf94ebc 100644 --- a/src/calibre/ebooks/lrf/html/table.py +++ b/src/calibre/ebooks/lrf/html/table.py @@ -213,7 +213,7 @@ class Row: def __init__(self, conv, row, css, colpad): self.cells = [] self.colpad = colpad - cells = row.findAll(re.compile('td|th', re.IGNORECASE)) + cells = row.findAll(re.compile(r'td|th', re.IGNORECASE)) self.targets = [] for cell in cells: ccss = conv.tag_css(cell, css)[0] diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 8e62be2ce7..5631f8e978 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -172,7 +172,7 @@ def get_title_sort_pat(lang=None): except: ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE) else: - ans = re.compile('^$') # matches only the empty string + ans = re.compile(r'^$') # matches only the empty string _title_pats[lang] = ans return ans diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 1499eb69c6..ac0fd27d01 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -139,7 +139,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None): try: pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE) except Exception: - pat = regex.compile('(?P.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE) + pat = regex.compile(r'(?P<title>.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE) name = name.replace('_', ' ') match = pat.search(name) diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py index 15b68c31ce..46673b577a 100644 --- a/src/calibre/ebooks/metadata/pdb.py +++ b/src/calibre/ebooks/metadata/pdb.py @@ -59,4 +59,4 @@ def set_metadata(stream, mi): MetadataWriter(stream, mi) stream.seek(0) - stream.write(re.sub('[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00') + stream.write(re.sub(r'[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00') diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 1bec72010d..b7697b3213 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -365,7 +365,7 @@ class Worker(Thread): # Get details {{{ r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) ' r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}' ) - self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星') + self.ratings_pat_cn = re.compile(r'([0-9.]+) 颗星,最多 5 颗星') self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)') lm = { diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py index dbbfdef934..567eca4453 100644 --- a/src/calibre/ebooks/metadata/sources/search_engines.py +++ b/src/calibre/ebooks/metadata/sources/search_engines.py @@ -165,7 +165,7 @@ def wayback_url_processor(url): if url.startswith('/'): # Use original URL instead of absolutizing to wayback URL as wayback is # slow - m = re.search('https?:', url) + m = re.search(r'https?:', url) if m is None: url = 'https://web.archive.org' + url else: diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 68aaf49aca..dfdc3af089 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -380,7 +380,7 @@ class MobiReader: self.processed_html = re.sub( r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html) bods = htmls = 0 - for x in re.finditer('</body>|</html>', self.processed_html): + for x in re.finditer(r'</body>|</html>', self.processed_html): if x == '</body>': bods +=1 else: diff --git a/src/calibre/ebooks/oeb/polish/main.py b/src/calibre/ebooks/oeb/polish/main.py index 005af7113c..37a3f159be 100644 --- a/src/calibre/ebooks/oeb/polish/main.py +++ b/src/calibre/ebooks/oeb/polish/main.py @@ -155,7 +155,7 @@ def hfix(name, raw): return raw -CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in iteritems(HELP)} +CLI_HELP = {x:hfix(x, re.sub(r'<.*?>', '', y)) for x, y in iteritems(HELP)} # }}} diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py index 50ed4b0f4c..e52196487a 100644 --- a/src/calibre/ebooks/oeb/polish/spell.py +++ b/src/calibre/ebooks/oeb/polish/spell.py @@ -36,7 +36,7 @@ class Patterns: # French words with prefixes are reduced to the stem word, so that the # words appear only once in the word list self.fr_elision_pat = regex.compile( - "^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE) + r"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE) def patterns(): diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index ce56c017c1..3604dda111 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -102,7 +102,7 @@ class SVGRasterizer: if view_box is not None: try: - box = [float(x) for x in filter(None, re.split('[, ]', view_box))] + box = [float(x) for x in filter(None, re.split(r'[, ]', view_box))] sizes = [box[2]-box[0], box[3] - box[1]] except (TypeError, ValueError, IndexError): logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 8b9e9a6a17..7f0aa9d12f 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -152,7 +152,7 @@ def flip_image(img, flip): def flip_images(raw): - for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I): + for match in re.finditer(r'<IMG[^>]+/?>', raw, flags=re.I): img = match.group() m = re.search(r'class="(x|y|xy)flip"', img) if m is None: @@ -174,5 +174,5 @@ def flip_images(raw): counter += 1 return m.group(1).rstrip('/') + f' alt="Image {counter}"/>' - raw = re.sub('(<IMG[^>]+)/?>', add_alt, raw, flags=re.I) + raw = re.sub(r'(<IMG[^>]+)/?>', add_alt, raw, flags=re.I) return raw diff --git a/src/calibre/ebooks/pdf/render/fonts.py b/src/calibre/ebooks/pdf/render/fonts.py index b7a8fc510f..2f0e5235da 100644 --- a/src/calibre/ebooks/pdf/render/fonts.py +++ b/src/calibre/ebooks/pdf/render/fonts.py @@ -121,7 +121,7 @@ class Font: self.metrics, self.compress = metrics, compress self.is_otf = self.metrics.is_otf self.subset_tag = str( - re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '') + re.sub(r'.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '') )).rjust(6, 'A') self.font_stream = FontStream(metrics.is_otf, compress=compress) try: diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index f963956285..b787010ba4 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -199,11 +199,11 @@ class PMLMLizer: text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) # Remove excess spaces at beginning and end of lines - text = re.sub('(?m)^[ ]+', '', text) - text = re.sub('(?m)[ ]+$', '', text) + text = re.sub(r'(?m)^[ ]+', '', text) + text = re.sub(r'(?m)[ ]+$', '', text) # Remove excessive spaces - text = re.sub('[ ]{2,}', ' ', text) + text = re.sub(r'[ ]{2,}', ' ', text) # Condense excessive \c empty line sequences. text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text) @@ -213,7 +213,7 @@ class PMLMLizer: if self.opts.remove_paragraph_spacing: text = re.sub('\n{2,}', '\n', text) # Only indent lines that don't have special formatting - text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') + text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text) else: text = re.sub('\n{3,}', '\n\n', text) diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index e1195efcce..dc4e7f0225 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -19,11 +19,11 @@ def tounicode(tree_or_node, **kwargs): REGEXES = { - 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I), # noqa: E501 - 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I), - 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I), - 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501 - 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I), + 'unlikelyCandidatesRe': re.compile(r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I), # noqa: E501 + 'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main|shadow',re.I), + 'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I), + 'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501 + 'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I), # 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I), # 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), # 'trimRe': re.compile('^\s+|\s+$/'), diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 28c3784d94..2559fb516a 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -121,7 +121,7 @@ class RTFMLizer: self.log.debug('Converting %s to RTF markup...' % item.href) # Removing comments is needed as comments with -- inside them can # cause fromstring() to fail - content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL) + content = re.sub(r'<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL) content = self.remove_newlines(content) content = self.remove_tabs(content) content = safe_xml_fromstring(content) @@ -198,7 +198,7 @@ class RTFMLizer: text = re.sub('%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text) # Remove excessive spaces - text = re.sub('[ ]{2,}', ' ', text) + text = re.sub(r'[ ]{2,}', ' ', text) text = re.sub('\t{2,}', '\t', text) text = text.replace('\t ', '\t') diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index ce2ce876ec..26b6dfe799 100644 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -652,7 +652,7 @@ class ProcessTokens: return f'cw<{pre}<{token}<nu<{type}\n' def __language_func(self, pre, token, num): - lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group())) + lang_name = self.__language_dict.get(int(re.search(r'[0-9]+', num).group())) if not lang_name: lang_name = 'not defined' if self.__run_level > 3: diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py index 4cd82498da..54da31fcbb 100644 --- a/src/calibre/ebooks/snb/snbml.py +++ b/src/calibre/ebooks/snb/snbml.py @@ -165,13 +165,13 @@ class SNBMLizer: text = re.sub('\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: text = re.sub('\n{2,}', '\n', text) - text = re.sub('(?imu)^(?=.)', '\t', text) + text = re.sub(r'(?imu)^(?=.)', '\t', text) else: text = re.sub('\n{3,}', '\n\n', text) # Replace spaces at the beginning and end of lines - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) + text = re.sub(r'(?imu)^[ ]+', '', text) + text = re.sub(r'(?imu)[ ]+$', '', text) if self.opts.snb_max_line_length: max_length = self.opts.snb_max_line_length diff --git a/src/calibre/ebooks/textile/unsmarten.py b/src/calibre/ebooks/textile/unsmarten.py index 6481b436f4..c140da9894 100644 --- a/src/calibre/ebooks/textile/unsmarten.py +++ b/src/calibre/ebooks/textile/unsmarten.py @@ -6,117 +6,117 @@ import re def unsmarten(txt): - txt = re.sub('¢|¢|¢', r'{c\}', txt) # cent - txt = re.sub('£|£|£', r'{L-}', txt) # pound - txt = re.sub('¥|¥|¥', r'{Y=}', txt) # yen - txt = re.sub('©|©|©', r'{(c)}', txt) # copyright - txt = re.sub('®|®|®', r'{(r)}', txt) # registered - txt = re.sub('¼|¼|¼', r'{1/4}', txt) # quarter - txt = re.sub('½|½|½', r'{1/2}', txt) # half - txt = re.sub('¾|¾|¾', r'{3/4}', txt) # three-quarter - txt = re.sub('À|À|À', r'{A`)}', txt) # A-grave - txt = re.sub('Á|Á|Á', r"{A'}", txt) # A-acute - txt = re.sub('Â|Â|Â', r'{A^}', txt) # A-circumflex - txt = re.sub('Ã|Ã|Ã', r'{A~}', txt) # A-tilde - txt = re.sub('Ä|Ä|Ä', r'{A"}', txt) # A-umlaut - txt = re.sub('Å|Å|Å', r'{Ao}', txt) # A-ring - txt = re.sub('Æ|Æ|Æ', r'{AE}', txt) # AE - txt = re.sub('Ç|Ç|Ç', r'{C,}', txt) # C-cedilla - txt = re.sub('È|È|È', r'{E`}', txt) # E-grave - txt = re.sub('É|É|É', r"{E'}", txt) # E-acute - txt = re.sub('Ê|Ê|Ê', r'{E^}', txt) # E-circumflex - txt = re.sub('Ë|Ë|Ë', r'{E"}', txt) # E-umlaut - txt = re.sub('Ì|Ì|Ì', r'{I`}', txt) # I-grave - txt = re.sub('Í|Í|Í', r"{I'}", txt) # I-acute - txt = re.sub('Î|Î|Î', r'{I^}', txt) # I-circumflex - txt = re.sub('Ï|Ï|Ï', r'{I"}', txt) # I-umlaut - txt = re.sub('Ð|Ð|Ð', r'{D-}', txt) # ETH - txt = re.sub('Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde - txt = re.sub('Ò|Ò|Ò', r'{O`}', txt) # O-grave - txt = re.sub('Ó|Ó|Ó', r"{O'}", txt) # O-acute - txt = re.sub('Ô|Ô|Ô', r'{O^}', txt) # O-circumflex - txt = re.sub('Õ|Õ|Õ', r'{O~}', txt) # O-tilde - txt = re.sub('Ö|Ö|Ö', r'{O"}', txt) # O-umlaut - txt = re.sub('×|×|×', r'{x}', txt) # dimension - txt = re.sub('Ø|Ø|Ø', r'{O/}', txt) # O-slash - txt = re.sub('Ù|Ù|Ù', r'{U`}', txt) # U-grave - txt = re.sub('Ú|Ú|Ú', r"{U'}", txt) # U-acute - txt = re.sub('Û|Û|Û', r'{U^}', txt) # U-circumflex - txt = re.sub('Ü|Ü|Ü', r'{U"}', txt) # U-umlaut - txt = re.sub('Ý|Ý|Ý', r"{Y'}", txt) # Y-grave - txt = re.sub('ß|ß|ß', r'{sz}', txt) # sharp-s - txt = re.sub('à|à|à', r'{a`}', txt) # a-grave - txt = re.sub('á|á|á', r"{a'}", txt) # a-acute - txt = re.sub('â|â|â', r'{a^}', txt) # a-circumflex - txt = re.sub('ã|ã|ã', r'{a~}', txt) # a-tilde - txt = re.sub('ä|ä|ä', r'{a"}', txt) # a-umlaut - txt = re.sub('å|å|å', r'{ao}', txt) # a-ring - txt = re.sub('æ|æ|æ', r'{ae}', txt) # ae - txt = re.sub('ç|ç|ç', r'{c,}', txt) # c-cedilla - txt = re.sub('è|è|è', r'{e`}', txt) # e-grave - txt = re.sub('é|é|é', r"{e'}", txt) # e-acute - txt = re.sub('ê|ê|ê', r'{e^}', txt) # e-circumflex - txt = re.sub('ë|ë|ë', r'{e"}', txt) # e-umlaut - txt = re.sub('ì|ì|ì', r'{i`}', txt) # i-grave - txt = re.sub('í|í|í', r"{i'}", txt) # i-acute - txt = re.sub('î|î|î', r'{i^}', txt) # i-circumflex - txt = re.sub('ï|ï|ï', r'{i"}', txt) # i-umlaut - txt = re.sub('ð|ð|ð', r'{d-}', txt) # eth - txt = re.sub('ñ|ñ|ñ', r'{n~}', txt) # n-tilde - txt = re.sub('ò|ò|ò', r'{o`}', txt) # o-grave - txt = re.sub('ó|ó|ó', r"{o'}", txt) # o-acute - txt = re.sub('ô|ô|ô', r'{o^}', txt) # o-circumflex - txt = re.sub('õ|õ|õ', r'{o~}', txt) # o-tilde - txt = re.sub('ö|ö|ö', r'{o"}', txt) # o-umlaut - txt = re.sub('ø|ø|ø', r'{o/}', txt) # o-stroke - txt = re.sub('ù|ù|ù', r'{u`}', txt) # u-grave - txt = re.sub('ú|ú|ú', r"{u'}", txt) # u-acute - txt = re.sub('û|û|û', r'{u^}', txt) # u-circumflex - txt = re.sub('ü|ü|ü', r'{u"}', txt) # u-umlaut - txt = re.sub('ý|ý|ý', r"{y'}", txt) # y-acute - txt = re.sub('ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(r'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(r'£|£|£', r'{L-}', txt) # pound + txt = re.sub(r'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(r'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(r'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(r'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(r'½|½|½', r'{1/2}', txt) # half + txt = re.sub(r'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(r'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(r'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(r'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(r'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(r'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(r'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(r'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(r'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(r'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(r'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(r'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(r'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(r'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(r'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(r'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(r'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(r'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(r'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(r'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(r'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(r'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(r'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(r'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(r'×|×|×', r'{x}', txt) # dimension + txt = re.sub(r'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(r'Ù|Ù|Ù', r'{U`}', txt) # U-grave + txt = re.sub(r'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(r'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(r'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(r'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(r'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(r'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(r'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(r'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(r'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(r'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(r'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(r'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(r'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(r'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(r'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(r'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(r'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(r'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(r'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(r'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(r'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(r'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(r'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(r'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(r'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(r'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(r'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(r'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(r'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(r'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(r'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(r'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(r'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(r'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(r'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut - txt = re.sub('Č|Č|Č', r'{Cˇ}', txt) # C-caron - txt = re.sub('č|č|č', r'{cˇ}', txt) # c-caron - txt = re.sub('Ď|Ď|Ď', r'{Dˇ}', txt) # D-caron - txt = re.sub('ď|ď|ď', r'{dˇ}', txt) # d-caron - txt = re.sub('Ě|Ě|Ě', r'{Eˇ}', txt) # E-caron - txt = re.sub('ě|ě|ě', r'{eˇ}', txt) # e-caron - txt = re.sub('Ĺ|Ĺ|Ĺ', r"{L'}", txt) # L-acute - txt = re.sub('ĺ|ĺ|ĺ', r"{l'}", txt) # l-acute - txt = re.sub('Ľ|Ľ|Ľ', r'{Lˇ}', txt) # L-caron - txt = re.sub('ľ|ľ|ľ', r'{lˇ}', txt) # l-caron - txt = re.sub('Ň|Ň|Ň', r'{Nˇ}', txt) # N-caron - txt = re.sub('ň|ň|ň', r'{nˇ}', txt) # n-caron + txt = re.sub(r'Č|Č|Č', r'{Cˇ}', txt) # C-caron + txt = re.sub(r'č|č|č', r'{cˇ}', txt) # c-caron + txt = re.sub(r'Ď|Ď|Ď', r'{Dˇ}', txt) # D-caron + txt = re.sub(r'ď|ď|ď', r'{dˇ}', txt) # d-caron + txt = re.sub(r'Ě|Ě|Ě', r'{Eˇ}', txt) # E-caron + txt = re.sub(r'ě|ě|ě', r'{eˇ}', txt) # e-caron + txt = re.sub(r'Ĺ|Ĺ|Ĺ', r"{L'}", txt) # L-acute + txt = re.sub(r'ĺ|ĺ|ĺ', r"{l'}", txt) # l-acute + txt = re.sub(r'Ľ|Ľ|Ľ', r'{Lˇ}', txt) # L-caron + txt = re.sub(r'ľ|ľ|ľ', r'{lˇ}', txt) # l-caron + txt = re.sub(r'Ň|Ň|Ň', r'{Nˇ}', txt) # N-caron + txt = re.sub(r'ň|ň|ň', r'{nˇ}', txt) # n-caron - txt = re.sub('Œ|Œ|Œ', r'{OE}', txt) # OE - txt = re.sub('œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(r'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(r'œ|œ|œ', r'{oe}', txt) # oe - txt = re.sub('Ŕ|Ŕ|Ŕ', r"{R'}", txt) # R-acute - txt = re.sub('ŕ|ŕ|ŕ', r"{r'}", txt) # r-acute - txt = re.sub('Ř|Ř|Ř', r'{Rˇ}', txt) # R-caron - txt = re.sub('ř|ř|ř', r'{rˇ}', txt) # r-caron - txt = re.sub('Ŝ|Ŝ', r'{S^}', txt) # S-circumflex - txt = re.sub('ŝ|ŝ', r'{s^}', txt) # s-circumflex - txt = re.sub('Š|Š|Š', r'{Sˇ}', txt) # S-caron - txt = re.sub('š|š|š', r'{sˇ}', txt) # s-caron - txt = re.sub('Ť|Ť|Ť', r'{Tˇ}', txt) # T-caron - txt = re.sub('ť|ť|ť', r'{tˇ}', txt) # t-caron - txt = re.sub('Ů|Ů|Ů', r'{U°}', txt) # U-ring - txt = re.sub('ů|ů|ů', r'{u°}', txt) # u-ring - txt = re.sub('Ž|Ž|Ž', r'{Zˇ}', txt) # Z-caron - txt = re.sub('ž|ž|ž', r'{zˇ}', txt) # z-caron + txt = re.sub(r'Ŕ|Ŕ|Ŕ', r"{R'}", txt) # R-acute + txt = re.sub(r'ŕ|ŕ|ŕ', r"{r'}", txt) # r-acute + txt = re.sub(r'Ř|Ř|Ř', r'{Rˇ}', txt) # R-caron + txt = re.sub(r'ř|ř|ř', r'{rˇ}', txt) # r-caron + txt = re.sub(r'Ŝ|Ŝ', r'{S^}', txt) # S-circumflex + txt = re.sub(r'ŝ|ŝ', r'{s^}', txt) # s-circumflex + txt = re.sub(r'Š|Š|Š', r'{Sˇ}', txt) # S-caron + txt = re.sub(r'š|š|š', r'{sˇ}', txt) # s-caron + txt = re.sub(r'Ť|Ť|Ť', r'{Tˇ}', txt) # T-caron + txt = re.sub(r'ť|ť|ť', r'{tˇ}', txt) # t-caron + txt = re.sub(r'Ů|Ů|Ů', r'{U°}', txt) # U-ring + txt = re.sub(r'ů|ů|ů', r'{u°}', txt) # u-ring + txt = re.sub(r'Ž|Ž|Ž', r'{Zˇ}', txt) # Z-caron + txt = re.sub(r'ž|ž|ž', r'{zˇ}', txt) # z-caron - txt = re.sub('•|•|•', r'{*}', txt) # bullet - txt = re.sub('₣|₣', r'{Fr}', txt) # Franc - txt = re.sub('₤|₤', r'{L=}', txt) # Lira - txt = re.sub('₨|₨', r'{Rs}', txt) # Rupee - txt = re.sub('€|€|€', r'{C=}', txt) # euro - txt = re.sub('™|™|™', r'{tm}', txt) # trademark - txt = re.sub('♠|♠|♠', r'{spade}', txt) # spade - txt = re.sub('♣|♣|♣', r'{club}', txt) # club - txt = re.sub('♥|♥|♥', r'{heart}', txt) # heart - txt = re.sub('♦|♦|♦', r'{diamond}', txt) # diamond + txt = re.sub(r'•|•|•', r'{*}', txt) # bullet + txt = re.sub(r'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(r'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(r'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(r'€|€|€', r'{C=}', txt) # euro + txt = re.sub(r'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(r'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(r'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(r'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(r'♦|♦|♦', r'{diamond}', txt) # diamond # Move into main code? # txt = re.sub('\xa0', r'p. ', txt) # blank paragraph diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 53465bde6c..0e8d7512c8 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -51,9 +51,9 @@ class MarkdownMLizer(OEB2HTML): def tidy_up(self, text): # Remove blank space form beginning of paragraph. - text = re.sub('(?msu)^[ ]{1,3}', '', text) + text = re.sub(r'(?msu)^[ ]{1,3}', '', text) # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre. - text = re.sub('(?msu)^[ ]', ' ', text) + text = re.sub(r'(?msu)^[ ]', ' ', text) # Remove tabs that aren't at the beginning of a line new_text = [] @@ -68,7 +68,7 @@ class MarkdownMLizer(OEB2HTML): text = '\n'.join(new_text) # Remove spaces from blank lines. - text = re.sub('(?msu)^[ ]+$', '', text) + text = re.sub(r'(?msu)^[ ]+$', '', text) # Reduce blank lines text = re.sub('(?msu)\n{7,}', '\n' * 6, text) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 08988eaddc..c818cfe2f2 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -34,7 +34,7 @@ def clean_txt(txt): txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt) # Condense redundant spaces - txt = re.sub('[ ]{2,}', ' ', txt) + txt = re.sub(r'[ ]{2,}', ' ', txt) # Remove blank space from the beginning and end of the document. txt = re.sub(r'^\s+(?=.)', '', txt) @@ -213,7 +213,7 @@ def preserve_spaces(txt): ''' Replaces spaces multiple spaces with   entities. ''' - txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt) + txt = re.sub(r'(?P<space>[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt) txt = txt.replace('\t', '    ') return txt @@ -325,9 +325,9 @@ def detect_formatting_type(txt): # Check for markdown # Headings - markdown_count += len(re.findall('(?mu)^#+', txt)) - markdown_count += len(re.findall('(?mu)^=+$', txt)) - markdown_count += len(re.findall('(?mu)^-+$', txt)) + markdown_count += len(re.findall(r'(?mu)^#+', txt)) + markdown_count += len(re.findall(r'(?mu)^=+$', txt)) + markdown_count += len(re.findall(r'(?mu)^-+$', txt)) # Images markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt)) # Links diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 4c5a39c72e..9c811f2d77 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -126,7 +126,7 @@ class TXTMLizer: text = re.sub('(?<=.)\n(?=.)', ' ', text) # Remove multiple spaces. - text = re.sub('[ ]{2,}', ' ', text) + text = re.sub(r'[ ]{2,}', ' ', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) @@ -140,8 +140,8 @@ class TXTMLizer: # Replace spaces at the beginning and end of lines # We don't replace tabs because those are only added # when remove paragraph spacing is enabled. - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) + text = re.sub(r'(?imu)^[ ]+', '', text) + text = re.sub(r'(?imu)[ ]+$', '', text) # Remove empty space and newlines at the beginning of the document. text = re.sub(r'(?u)^[ \n]+', '', text) diff --git a/src/calibre/gui2/dialogs/search.py b/src/calibre/gui2/dialogs/search.py index 7dad89f7d2..d5fb9bd5b4 100644 --- a/src/calibre/gui2/dialogs/search.py +++ b/src/calibre/gui2/dialogs/search.py @@ -406,7 +406,7 @@ class SearchDialog(QDialog): self.resize(self.sizeHint()) def retrieve_template_search(self): - template, sep, query = re.split('#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE) + template, sep, query = re.split(r'#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE) self.template_value_box.setText(query) cb = self.template_test_type_box for idx in range(0, cb.count()): diff --git a/src/calibre/gui2/preferences/create_custom_column.py b/src/calibre/gui2/preferences/create_custom_column.py index fcee648991..3cf7511135 100644 --- a/src/calibre/gui2/preferences/create_custom_column.py +++ b/src/calibre/gui2/preferences/create_custom_column.py @@ -744,7 +744,7 @@ class CreateCustomColumn(QDialog): return self.simple_error('', _('The colors box must be empty or ' 'contain the same number of items as the value box')) for tc in c: - if tc not in QColor.colorNames() and not re.match('#(?:[0-9a-f]{3}){1,4}',tc,re.I): + if tc not in QColor.colorNames() and not re.match(r'#(?:[0-9a-f]{3}){1,4}',tc,re.I): return self.simple_error('', _('The color {0} is unknown').format(tc)) display_dict = {'enum_values': l, 'enum_colors': c} if default_val: diff --git a/src/calibre/gui2/preferences/emailp.py b/src/calibre/gui2/preferences/emailp.py index 2ae7ca1530..dbd058c302 100644 --- a/src/calibre/gui2/preferences/emailp.py +++ b/src/calibre/gui2/preferences/emailp.py @@ -146,7 +146,7 @@ class EmailAccounts(QAbstractTableModel): # {{{ if aval: self.tags[account] = aval elif col == 1: - self.accounts[account][0] = re.sub(',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper())) + self.accounts[account][0] = re.sub(r',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper())) elif col == 0: na = as_unicode(value or '').strip() from email.utils import parseaddr diff --git a/src/calibre/gui2/tweak_book/editor/smarts/html.py b/src/calibre/gui2/tweak_book/editor/smarts/html.py index 139610fa15..aa2a1c4df7 100644 --- a/src/calibre/gui2/tweak_book/editor/smarts/html.py +++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py @@ -920,6 +920,6 @@ if __name__ == '__main__': # {{{ def callback(ed): import regex - ed.find_text(regex.compile('A bold word')) + ed.find_text(regex.compile(r'A bold word')) launch_editor(raw, path_is_raw=True, syntax='html', callback=callback) # }}} diff --git a/src/calibre/library/catalogs/epub_mobi_builder.py b/src/calibre/library/catalogs/epub_mobi_builder.py index 884e0250de..22031334a3 100644 --- a/src/calibre/library/catalogs/epub_mobi_builder.py +++ b/src/calibre/library/catalogs/epub_mobi_builder.py @@ -3828,7 +3828,7 @@ class CatalogBuilder: # if self.opts.numbers_as_text and re.match('[0-9]+',word[0]): translated.append(NumberToText(word).text.capitalize()) else: - if re.match('[0-9]+', word[0]): + if re.match(r'[0-9]+', word[0]): word = word.replace(',', '') suffix = re.search(r'[\D]', word) if suffix: @@ -3844,7 +3844,7 @@ class CatalogBuilder: translated.append(capitalize(word)) else: - if re.search('[0-9]+', word[0]): + if re.search(r'[0-9]+', word[0]): word = word.replace(',', '') suffix = re.search(r'[\D]', word) if suffix: @@ -4114,7 +4114,7 @@ class CatalogBuilder: Return: (str): char if A-z, else SYMBOLS ''' - if not re.search('[a-zA-Z]', ascii_text(char)): + if not re.search(r'[a-zA-Z]', ascii_text(char)): return self.SYMBOLS else: return char diff --git a/src/calibre/library/catalogs/utils.py b/src/calibre/library/catalogs/utils.py index 967b5faff4..b297db3cf9 100644 --- a/src/calibre/library/catalogs/utils.py +++ b/src/calibre/library/catalogs/utils.py @@ -87,7 +87,7 @@ class NumberToText: # {{{ self.log('numberTranslate(): %s' % self.number) # Special case ordinals - if re.search('[st|nd|rd|th]',self.number): + if re.search(r'[st|nd|rd|th]',self.number): self.number = self.number.replace(',', '') ordinal_suffix = re.search(r'[\D]', self.number) ordinal_number = re.sub(r'\D','',self.number.replace(',', '')) @@ -134,7 +134,7 @@ class NumberToText: # {{{ self.log('Hyphenated: %s' % self.number) self.number_as_float = self.number.split('-')[0] strings = self.number.split('-') - if re.search('[0-9]+', strings[0]): + if re.search(r'[0-9]+', strings[0]): left = NumberToText(strings[0]).text right = strings[1] else: @@ -143,7 +143,7 @@ class NumberToText: # {{{ self.text = f'{left}-{right}' # Test for only commas and numbers - elif ',' in self.number and not re.search('[^0-9,]',self.number): + elif ',' in self.number and not re.search(r'[^0-9,]',self.number): if self.verbose: self.log('Comma(s): %s' % self.number) self.number_as_float = self.number.replace(',', '') diff --git a/src/calibre/library/database.py b/src/calibre/library/database.py index 2058cc6c2a..4fe0d807c9 100644 --- a/src/calibre/library/database.py +++ b/src/calibre/library/database.py @@ -1504,11 +1504,11 @@ def text_to_tokens(text): text = match.group(1) OR = True tokens = [] - quot = re.search('"(.*?)"', text) + quot = re.search(r'"(.*?)"', text) while quot: tokens.append(quot.group(1)) text = text.replace('"'+quot.group(1)+'"', '') - quot = re.search('"(.*?)"', text) + quot = re.search(r'"(.*?)"', text) tokens += text.split(' ') ans = [] for i in tokens: diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index c959a63c4b..e3f48f84c4 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -2556,7 +2556,7 @@ class BibTeX: self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]') self.upper = re.compile('[' + string.ascii_uppercase + ']') - self.escape = re.compile('[#&%_]') + self.escape = re.compile(r'[#&%_]') def ValidateCitationKey(self, text): ''' diff --git a/src/calibre/utils/complete.py b/src/calibre/utils/complete.py index 38387a7ade..144416e2a0 100644 --- a/src/calibre/utils/complete.py +++ b/src/calibre/utils/complete.py @@ -59,7 +59,7 @@ def get_opts_from_parser(parser, prefix): def send(ans): - pat = re.compile('([^0-9a-zA-Z_./-])') + pat = re.compile(r'([^0-9a-zA-Z_./-])') for x in sorted(set(ans)): x = pat.sub(lambda m : '\\'+m.group(1), x) if x.endswith('\\ '): diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py index 045350002b..60a52aac5b 100644 --- a/src/calibre/utils/date.py +++ b/src/calibre/utils/date.py @@ -384,7 +384,7 @@ def format_date(dt, format, assume_utc=False, as_utc=False): repl_func = partial(fd_repl_func, dt, 'ap' in format.lower()) return re.sub( - '(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))', + r'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, format) # }}} @@ -460,7 +460,7 @@ def clean_date_for_sort(dt, fmt=None): 'min':UNDEFINED_DATE.minute, 'sec':UNDEFINED_DATE.second} repl_func = partial(cd_repl_func, tt, dt) - re.sub('(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt) + re.sub(r'(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt) return dt.replace(year=tt['year'], month=tt['mon'], day=tt['day'], hour=tt['hour'], minute=tt['min'], second=tt['sec'], microsecond=0) # }}} diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 863c101ec7..94ead6b017 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -90,7 +90,7 @@ def get_system_locale(): def sanitize_lang(lang): if lang: - match = re.match('[a-z]{2,3}(_[A-Z]{2}){0,1}', lang) + match = re.match(r'[a-z]{2,3}(_[A-Z]{2}){0,1}', lang) if match: lang = match.group() if lang == 'zh': diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py index 08b887a6f8..e55efebf76 100644 --- a/src/calibre/utils/search_query_parser.py +++ b/src/calibre/utils/search_query_parser.py @@ -195,7 +195,7 @@ class Parser: def tokenize(self, expr): # convert docstrings to base64 to avoid all processing. Change the docstring # indicator to something unique with no characters special to the parser. - expr = re.sub('(""")(..*?)(""")', + expr = re.sub(r'(""")(..*?)(""")', lambda mo: self.docstring_sep + as_hex_unicode(mo.group(2)) + self.docstring_sep, expr, flags=re.DOTALL) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 545b4ebeb8..78282a1d99 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -1730,7 +1730,7 @@ class BasicNewsRecipe(Recipe): def error_in_article_download(self, request, traceback): self.jobs_done += 1 - if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None: + if traceback and re.search(r'^AbortArticle:', traceback, flags=re.M) is not None: self.log.warn('Aborted download of article:', request.article.title, 'from', request.article.url) self.report_progress(float(self.jobs_done)/len(self.jobs), diff --git a/src/odf/easyliststyle.py b/src/odf/easyliststyle.py index 42f4097058..81dcc2cc7a 100644 --- a/src/odf/easyliststyle.py +++ b/src/odf/easyliststyle.py @@ -59,7 +59,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels): numbered = False displayLevels = 0 listStyle = ListStyle(name=styleName) - numFormatPattern = re.compile('([1IiAa])') + numFormatPattern = re.compile(r'([1IiAa])') cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?') m = cssLengthPattern.search(spacing) if (m is not None):