diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe index c62f1ceef4..6265f25011 100644 --- a/recipes/android_com_pl.recipe +++ b/recipes/android_com_pl.recipe @@ -16,5 +16,5 @@ class Android_com_pl(BasicNewsRecipe): remove_tags_after = [{'class': 'post-content'}] remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})] preprocess_regexps = [ - (re.compile(u'

.{,1}

', re.DOTALL), lambda match: '')] + (re.compile(r'

.{,1}

', re.DOTALL), lambda match: '')] feeds = [(u'Android', u'http://android.com.pl/feed/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index f96f3364aa..e3e0aef637 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -17,8 +17,8 @@ class BenchmarkPl(BasicNewsRecipe): extra_css = 'ul {list-style-type: none;}' no_stylesheets = True use_embedded_content = False - preprocess_regexps = [(re.compile(u'

 Zobacz poprzednie Opinie dnia:.*', # noqa: E501 - re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Więcej o .*?', re.DOTALL | re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(u'

 Zobacz poprzednie Opinie dnia:.*', # noqa: E501, RUF039 + re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Więcej o .*?', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: RUF039 keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict( name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')] diff --git a/recipes/blic.recipe b/recipes/blic.recipe index fbd2e463bd..ec44e53123 100644 --- a/recipes/blic.recipe +++ b/recipes/blic.recipe @@ -41,7 +41,7 @@ class Blic(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags_before = dict(name='div', attrs={'id': 'article_info'}) remove_tags = [ dict(name=['object', 'link', 'meta', 'base', 'object', 'embed'])] diff --git a/recipes/cvecezla.recipe b/recipes/cvecezla.recipe index 0ae2f8f60e..6a330cb194 100644 --- a/recipes/cvecezla.recipe +++ b/recipes/cvecezla.recipe @@ -27,7 +27,7 @@ class CveceZla(BasicNewsRecipe): 'comment': description, 'tags': 'igre, muzika, film, blog, Srbija', 'publisher': 'Mehmet Krljic', 'language': language } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags_before = dict(attrs={'class': 'navigation'}) remove_tags_after = dict(attrs={'class': 'commentlist'}) diff --git a/recipes/daum_net.recipe b/recipes/daum_net.recipe index c1adb61c21..be4b3f15b6 100644 --- a/recipes/daum_net.recipe +++ b/recipes/daum_net.recipe @@ -61,7 +61,7 @@ class MediaDaumRecipe(BasicNewsRecipe): lambda match: ''), (re.compile(r'(]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE), lambda match: ''), - (re.compile(u'(]*>[ 
\t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*', re.DOTALL | re.IGNORECASE), # noqa: E501 + (re.compile(u'(]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*', re.DOTALL | re.IGNORECASE), # noqa: E501, RUF039 lambda match: ''), ] diff --git a/recipes/dnevnik_cro.recipe b/recipes/dnevnik_cro.recipe index d2e3303763..6ace469209 100644 --- a/recipes/dnevnik_cro.recipe +++ b/recipes/dnevnik_cro.recipe @@ -42,7 +42,7 @@ class DnevnikCro(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 keep_only_tags = [dict(name='div', attrs={'id': 'article'})] diff --git a/recipes/dziennik_wschodni.recipe b/recipes/dziennik_wschodni.recipe index 9c6c04a214..18cd71d6aa 100644 --- a/recipes/dziennik_wschodni.recipe +++ b/recipes/dziennik_wschodni.recipe @@ -20,8 +20,8 @@ class DziennikWschodni(BasicNewsRecipe): no_stylesheets = True ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(u'Czytaj:.*?', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501 - (re.compile(u'Przeczytaj również:.*?', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501 + preprocess_regexps = [(re.compile(u'Czytaj:.*?', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501, RUF039 + (re.compile(u'Przeczytaj również:.*?', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: 
E501, RUF039 keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', diff --git a/recipes/esenja.recipe b/recipes/esenja.recipe index 42f43f7192..0417b5d05f 100644 --- a/recipes/esenja.recipe +++ b/recipes/esenja.recipe @@ -45,9 +45,7 @@ class Esensja(BasicNewsRecipe): ''' preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), - (re.compile( - u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), - ] + (re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')] def parse_index(self): soup = self.index_to_soup('http://www.esensja.pl/magazyn/') diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe index 6ce8b697e1..55630cf6a7 100644 --- a/recipes/esensja_(rss).recipe +++ b/recipes/esensja_(rss).recipe @@ -23,9 +23,7 @@ class EsensjaRSS(BasicNewsRecipe): remove_javascript = True ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), - (re.compile( - u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), - ] + (re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')] remove_attributes = ['style', 'bgcolor', 'alt', 'color'] keep_only_tags = [dict(attrs={'class': 'sekcja'}), ] remove_tags_after = dict(id='tekst') diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 77c57458be..da8eede0b3 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe): 'ul.sep-line > li + li::before {content: " | "} ' 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... 
- (re.compile(u'(?:)?\\(kliknij\\,\\ aby powiększyć\\)(?:)?', re.IGNORECASE), lambda m: ''), + (re.compile(u'(?:)?\\(kliknij\\,\\ aby powiększyć\\)(?:)?', re.IGNORECASE), lambda m: ''), # noqa: RUF039 (re.compile(type(u'')(r'(
\s*?
\s*?)+'), re.IGNORECASE), lambda m: '
') ] remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 2469536fd2..cc3067dec3 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -34,7 +34,7 @@ class gw_krakow(BasicNewsRecipe): # rules for gazeta.pl preprocess_regexps = [ - (re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + (re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] # noqa: RUF039 keep_only_tags = [dict(id='gazeta_article')] remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict( attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index a0fa38e851..df3dcd3b40 100644 --- a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -33,7 +33,7 @@ class gw_wawa(BasicNewsRecipe): # rules for gazeta.pl preprocess_regexps = [ - (re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + (re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] # noqa: RUF039 keep_only_tags = [dict(id='gazeta_article')] remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict( attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] diff --git a/recipes/granta.recipe b/recipes/granta.recipe index 989d4d9700..a0597bdc4e 100644 --- a/recipes/granta.recipe +++ b/recipes/granta.recipe @@ -54,11 +54,11 @@ def solve_captcha(captcha): # Parse into parts pattern = re.compile( - u'(?P[0-9]+)?' - u'\\s*(?P[+×−])\\s*' - u'(?P[0-9]+)' - u'\\s*(=)\\s*' - u'(?P[0-9]+)?', re.UNICODE) + r'(?P[0-9]+)?' 
+ u'\\s*(?P[+×−])\\s*' # noqa: RUF039 + r'(?P[0-9]+)' + r'\s*(=)\s*' + r'(?P[0-9]+)?', re.UNICODE) calculationParts = re.search(pattern, numeric_problem) if calculationParts is None: diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe index b80b1255a9..b4b8331b35 100644 --- a/recipes/in4_pl.recipe +++ b/recipes/in4_pl.recipe @@ -16,7 +16,7 @@ class in4(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True preprocess_regexps = [ - (re.compile(u'', re.DOTALL), lambda match: '')] keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})] remove_tags_after = dict(name='img', attrs={'title': 'komentarze'}) remove_tags = [dict(name='img', attrs={'title': 'komentarze'})] diff --git a/recipes/kopalniawiedzy.recipe b/recipes/kopalniawiedzy.recipe index f0e8ce9354..5b67df334b 100644 --- a/recipes/kopalniawiedzy.recipe +++ b/recipes/kopalniawiedzy.recipe @@ -29,9 +29,9 @@ class KopalniaWiedzy(BasicNewsRecipe): extra_css = '.topimage {margin-top: 30px}' preprocess_regexps = [ - (re.compile(u''), + (re.compile(r''), lambda match: ''), - (re.compile(u'

'), + (re.compile(r'

'), lambda match: '
') ] diff --git a/recipes/le_monde_sub_paper.recipe b/recipes/le_monde_sub_paper.recipe index e2651ab6d0..698f36e33f 100644 --- a/recipes/le_monde_sub_paper.recipe +++ b/recipes/le_monde_sub_paper.recipe @@ -66,7 +66,7 @@ class LeMondeAbonne(BasicNewsRecipe): dict(name='div', attrs={'class': 'po-copy'}) ] - article_id_pattern = re.compile('[0-9]+\\.html') + article_id_pattern = re.compile(r'[0-9]+\.html') article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' def get_browser(self): diff --git a/recipes/nacional_cro.recipe b/recipes/nacional_cro.recipe index 97333d6c9b..6dd5f6d9ac 100644 --- a/recipes/nacional_cro.recipe +++ b/recipes/nacional_cro.recipe @@ -43,7 +43,7 @@ class NacionalCro(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags = [dict(name=['object', 'link', 'embed'])] diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index ebb69ed476..8ecb9c6c16 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -11,8 +11,8 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' - preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?', re.IGNORECASE), lambda m: ''), # noqa: E501 - (re.compile(u'Czytaj więcej\\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?', re.IGNORECASE), lambda m: '')] # noqa: E501 + preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?', re.IGNORECASE), lambda m: ''), # noqa: E501, RUF039 + (re.compile(u'Czytaj więcej\\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?', re.IGNORECASE), lambda m: 
'')] # noqa: E501, RUF039 cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags = [dict(id='main')] diff --git a/recipes/njuz_net.recipe b/recipes/njuz_net.recipe index bd9ae24395..4977ea42a4 100644 --- a/recipes/njuz_net.recipe +++ b/recipes/njuz_net.recipe @@ -34,7 +34,7 @@ class NjuzNet(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 keep_only_tags = [ dict(attrs={'id': 'entryMeta'}), dict(attrs={'class': 'post'}) diff --git a/recipes/novilist_novine_hr.recipe b/recipes/novilist_novine_hr.recipe index eeb3c1e6f6..61e6f78e71 100644 --- a/recipes/novilist_novine_hr.recipe +++ b/recipes/novilist_novine_hr.recipe @@ -36,7 +36,7 @@ class NoviList_hr(BasicNewsRecipe): p{display: block} ''' - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 conversion_options = { 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True diff --git a/recipes/novosti.recipe b/recipes/novosti.recipe index 2c926ab51b..442b2b4c5c 100644 --- a/recipes/novosti.recipe +++ b/recipes/novosti.recipe @@ -35,7 +35,7 @@ class Novosti(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 keep_only_tags = [dict(attrs={'class': [ 'articleTitle', 'articleInfo', 'articleLead', 'singlePhoto fl', 'articleBody']})] diff --git a/recipes/nspm.recipe b/recipes/nspm.recipe index f533f4e5fc..a05107c411 100644 --- a/recipes/nspm.recipe +++ 
b/recipes/nspm.recipe @@ -45,7 +45,7 @@ class Nspm(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])] remove_attributes = ['width', 'height', 'lang', 'xmlns:fb', 'xmlns:og', 'vspace', 'hspace', 'type', 'start', 'size'] diff --git a/recipes/observatorul_cultural.recipe b/recipes/observatorul_cultural.recipe index 304024a53c..07aa0d3635 100644 --- a/recipes/observatorul_cultural.recipe +++ b/recipes/observatorul_cultural.recipe @@ -31,8 +31,7 @@ class ObservatorulCultural(BasicNewsRecipe): def parse_index(self): soup = self.index_to_soup( 'http://www.observatorcultural.ro/Arhiva*-archive.html') - issueTag = soup.find('a', href=re.compile( - 'observatorcultural.ro\\/Numarul')) + issueTag = soup.find('a', href=re.compile(r'observatorcultural.ro/Numarul')) issueURL = issueTag['href'] print(issueURL) issueSoup = self.index_to_soup(issueURL) diff --git a/recipes/pescanik.recipe b/recipes/pescanik.recipe index 157bb4b7e2..b5978c82a6 100644 --- a/recipes/pescanik.recipe +++ b/recipes/pescanik.recipe @@ -34,7 +34,7 @@ class Pescanik(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags = [ dict(name=['object', 'link', 'meta', 'script', 'iframe', 'embed'])] keep_only_tags = [ diff --git a/recipes/root.recipe b/recipes/root.recipe index 61a2f2c025..d3b129ab7c 100644 --- a/recipes/root.recipe +++ b/recipes/root.recipe @@ -31,9 +31,9 @@ class AdvancedUserRecipe1289939440(BasicNewsRecipe): ] preprocess_regexps = [ - (re.compile(u'

[^<]*]*>', + (re.compile(r'

[^<]*]*>', re.DOTALL), lambda match: '

'), - (re.compile(u'

Tričko tučňák.*', + (re.compile(u'

Tričko tučňák.*', # noqa: RUF039 re.DOTALL), lambda match: '') ] diff --git a/recipes/rts.recipe b/recipes/rts.recipe index 3ed5fba7d6..310feba470 100644 --- a/recipes/rts.recipe +++ b/recipes/rts.recipe @@ -38,7 +38,7 @@ class RTS(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 feeds = [ diff --git a/recipes/swiatkindle.recipe b/recipes/swiatkindle.recipe index 0616e4ff83..95e289f81f 100644 --- a/recipes/swiatkindle.recipe +++ b/recipes/swiatkindle.recipe @@ -25,4 +25,4 @@ class swiatczytnikow(BasicNewsRecipe): dict(name='div', attrs={'class': 'feedflare'})] preprocess_regexps = [ - (re.compile(u'

Czytaj dalej:

'), lambda match: '')] + (re.compile(u'

Czytaj dalej:

'), lambda match: '')] # noqa: RUF039 diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index 43781dc3a1..76c363a87f 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -15,8 +15,8 @@ class Tablety_pl(BasicNewsRecipe): no_stylesheets = True oldest_article = 8 max_articles_per_feed = 100 - preprocess_regexps = [(re.compile(u'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), - (re.compile(u'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), # noqa: RUF039 + (re.compile(u'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] # noqa: RUF039 keep_only_tags = [dict(attrs={'class': ['featured-image', 'article-content clearfix']})] remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})] diff --git a/recipes/thecultofghoul.recipe b/recipes/thecultofghoul.recipe index 77da5d4299..72a30a9ee2 100644 --- a/recipes/thecultofghoul.recipe +++ b/recipes/thecultofghoul.recipe @@ -27,7 +27,7 @@ class TheCultOfGhoul(BasicNewsRecipe): 'comment': description, 'tags': 'film, blog, srbija, strava, uzas', 'publisher': 'Dejan Ognjanovic', 'language': language } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')] diff --git a/recipes/vecernji_list.recipe b/recipes/vecernji_list.recipe index 5bcd540f18..3f39424eb4 100644 --- a/recipes/vecernji_list.recipe +++ b/recipes/vecernji_list.recipe @@ -43,7 +43,7 @@ class VecernjiList(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags = [ dict(name=['object', 'link', 'embed']), dict( diff --git a/recipes/vreme.recipe b/recipes/vreme.recipe index fc83944a97..febbabf1bb 100644 --- a/recipes/vreme.recipe +++ b/recipes/vreme.recipe @@ -41,7 +41,7 @@ class Vreme(BasicNewsRecipe): 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039 remove_tags_before = dict(attrs={'class': 'toc-heading'}) 
remove_tags_after = dict(attrs={'class': 'footer'}) diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe index c4ddbf0cca..7cba0290e3 100644 --- a/recipes/wnp.recipe +++ b/recipes/wnp.recipe @@ -10,8 +10,8 @@ class WNP(BasicNewsRecipe): description = u'Wirtualny Nowy Przemysł' category = 'economy' language = 'pl' - preprocess_regexps = [(re.compile(u'Czytaj też:.*?', re.DOTALL), lambda match: ''), - (re.compile(u'Czytaj więcej:.*?', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'Czytaj też:.*?', re.DOTALL), lambda match: ''), # noqa: RUF039 + (re.compile(u'Czytaj więcej:.*?', re.DOTALL), lambda match: '')] # noqa: RUF039 oldest_article = 8 max_articles_per_feed = 100 no_stylesheets = True diff --git a/recipes/zeitde_sub.recipe b/recipes/zeitde_sub.recipe index 3895c3fe8d..f3b044fa32 100644 --- a/recipes/zeitde_sub.recipe +++ b/recipes/zeitde_sub.recipe @@ -48,7 +48,7 @@ class ZeitEPUBAbo(BasicNewsRecipe): preprocess_regexps = [ # filtering for correct dashes ("Gedankenstrich" and "bis") - (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), + (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), # noqa: RUF039 (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number (re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro # fix the number dash number dash for the title image that was broken @@ -130,9 +130,9 @@ class ZeitEPUBAbo(BasicNewsRecipe): (re.compile( r'(?<=

[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # before closing quotation - (re.compile(u' \u00AB'), lambda match: u'\u00AB '), + (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # noqa: RUF039 # after opening quotation - (re.compile(u'\u00BB '), lambda match: u' \u00BB'), + (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # noqa: RUF039 # filtering for spaces in large numbers for better readability # end of the number with some character following (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'), @@ -151,25 +151,25 @@ class ZeitEPUBAbo(BasicNewsRecipe): # filtering for unicode characters that are missing on the Kindle, # try to replace them with meaningful work-arounds # subscript-0 - (re.compile(u'\u2080'), lambda match: '0'), + (re.compile(u'\u2080'), lambda match: '0'), # noqa: RUF039 # subscript-1 - (re.compile(u'\u2081'), lambda match: '1'), + (re.compile(u'\u2081'), lambda match: '1'), # noqa: RUF039 # subscript-2 - (re.compile(u'\u2082'), lambda match: '2'), + (re.compile(u'\u2082'), lambda match: '2'), # noqa: RUF039 # subscript-3 - (re.compile(u'\u2083'), lambda match: '3'), + (re.compile(u'\u2083'), lambda match: '3'), # noqa: RUF039 # subscript-4 - (re.compile(u'\u2084'), lambda match: '4'), + (re.compile(u'\u2084'), lambda match: '4'), # noqa: RUF039 # subscript-5 - (re.compile(u'\u2085'), lambda match: '5'), + (re.compile(u'\u2085'), lambda match: '5'), # noqa: RUF039 # subscript-6 - (re.compile(u'\u2086'), lambda match: '6'), + (re.compile(u'\u2086'), lambda match: '6'), # noqa: RUF039 # subscript-7 - (re.compile(u'\u2087'), lambda match: '7'), + (re.compile(u'\u2087'), lambda match: '7'), # noqa: RUF039 # subscript-8 - (re.compile(u'\u2088'), lambda match: '8'), + (re.compile(u'\u2088'), lambda match: '8'), # noqa: RUF039 # subscript-9 - (re.compile(u'\u2089'), lambda match: '9'), + (re.compile(u'\u2089'), lambda match: '9'), # noqa: RUF039 # always chance CO2 (re.compile(r'CO2'), lambda match: 'CO2'), # CO2 # remove *** paragraphs diff --git 
a/src/calibre/devices/kindle/apnx_page_generator/generators/pagebreak_page_generator.py b/src/calibre/devices/kindle/apnx_page_generator/generators/pagebreak_page_generator.py index 433961b1eb..0d8870ebb7 100644 --- a/src/calibre/devices/kindle/apnx_page_generator/generators/pagebreak_page_generator.py +++ b/src/calibre/devices/kindle/apnx_page_generator/generators/pagebreak_page_generator.py @@ -21,7 +21,7 @@ class PagebreakPageGenerator(IPageGenerator): ''' Determine pages based on the presence of <*pagebreak*/>. ''' html = mobi_html(mobi_file_path) pages = [] - for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html): + for m in re.finditer(br'<[^>]*pagebreak[^>]*>', html): pages.append(m.end()) return Pages(pages) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 76e63e958e..e509e19d69 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -32,7 +32,7 @@ class TCRCompressor: The intent is to create more unused codes. ''' possible_codes = [] - a_code = set(re.findall(b'(?ms).', self.coded_txt)) + a_code = set(re.findall(br'(?ms).', self.coded_txt)) for code in a_code: single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt)) @@ -57,7 +57,7 @@ class TCRCompressor: ''' Create new codes from codes that occur in pairs often. 
''' - possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt))) + possible_new_codes = list(set(re.findall(br'(?ms)..', self.coded_txt))) new_codes_count = [] for c in possible_new_codes: @@ -74,7 +74,7 @@ class TCRCompressor: def compress(self, txt): self._reset() - self.codes = list(set(re.findall(b'(?ms).', txt))) + self.codes = list(set(re.findall(br'(?ms).', txt))) # Replace the text with their corresponding code # FIXME: python3 is native bytearray, but all we want are bytes diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py index a2404dfd8c..a3222e685f 100644 --- a/src/calibre/ebooks/conversion/plugins/epub_input.py +++ b/src/calibre/ebooks/conversion/plugins/epub_input.py @@ -46,7 +46,7 @@ class EPUBInput(InputFormatPlugin): from lxml import etree idpf_key = opf.raw_unique_identifier if idpf_key: - idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) + idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key) idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest() key = None for item in opf.identifier_iter(): diff --git a/src/calibre/ebooks/conversion/plugins/epub_output.py b/src/calibre/ebooks/conversion/plugins/epub_output.py index 6a41d42858..5f06404363 100644 --- a/src/calibre/ebooks/conversion/plugins/epub_output.py +++ b/src/calibre/ebooks/conversion/plugins/epub_output.py @@ -503,7 +503,7 @@ class EPUBOutput(OutputFormatPlugin): tag.tag = XHTML('div') # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces - special_chars = re.compile('[\u200b\u00ad]') + special_chars = re.compile(r'[\u200b\u00ad]') for elem in root.iterdescendants('*'): if elem.text: elem.text = special_chars.sub('', elem.text) diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py index 5809e6fc7a..fdf0052022 100644 --- a/src/calibre/ebooks/conversion/plugins/rtf_input.py +++ 
b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin): res = as_bytes(transform.tostring(result)) # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # clean multiple \n - res = re.sub(b'\n+', b'\n', res) + res = re.sub(br'\n+', b'\n', res) # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # res = re.sub('\s*', '', res) # res = re.sub('(?<=\n)\n{2}', diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 54cc9afb99..f76c70b04b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -94,7 +94,7 @@ class DocAnalysis: elif format == 'spanned_html': linere = re.compile(r'(?<=)', re.DOTALL) elif format == 'txt': - linere = re.compile('.*?\n') + linere = re.compile(r'.*?\n') self.lines = linere.findall(raw) def line_length(self, percent): diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index fbb626f069..ac1fb1daf1 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -57,8 +57,8 @@ class HeuristicProcessor: ' chapters. - ' + str(chap)) return '

'+chap+'

\n' else: - delete_whitespace = re.compile('^\\s*(?P.*?)\\s*$') - delete_quotes = re.compile('\'"') + delete_whitespace = re.compile(r'^\s*(?P.*?)\s*$') + delete_quotes = re.compile(r'\'"') txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(chap))) txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(title))) self.html_preprocess_sections = self.html_preprocess_sections + 1 @@ -109,7 +109,7 @@ class HeuristicProcessor: be marked up to return true. ''' htm_end_ere = re.compile(r'', re.DOTALL) - line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + line_end_ere = re.compile(r'(\n|\r|\r\n)', re.DOTALL) htm_end = htm_end_ere.findall(raw) line_end = line_end_ere.findall(raw) tot_htm_ends = len(htm_end) @@ -417,7 +417,7 @@ class HeuristicProcessor: # Add markup naively # TODO - find out if there are cases where there are more than one
 tag or
             # other types of unmarked html and handle them in some better fashion
-            add_markup = re.compile('(?)(\n)')
+            add_markup = re.compile(r'(?)(\n)')
             html = add_markup.sub('

\n

', html) return html @@ -440,7 +440,7 @@ class HeuristicProcessor: # Get rid of empty tags to simplify other processing html = re.sub(r'\s*\s*', ' ', html) # Delete microsoft 'smart' tags - html = re.sub('(?i)', '', html) + html = re.sub(r'(?i)', '', html) # Re-open self closing paragraph tags html = re.sub(r'/]*/>', '

', html) # Get rid of empty span, bold, font, em, & italics tags @@ -451,7 +451,7 @@ class HeuristicProcessor: html = re.sub( r'\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}'.format(open=open_fmt_pat, close=close_fmt_pat) , ' ', html) # delete surrounding divs from empty paragraphs - html = re.sub(']*>\\s*]*>\\s*

\\s*', '

', html) + html = re.sub(r']*>\s*]*>\s*

\s*', '

', html) # Empty heading tags html = re.sub(r'(?i)\s*', '', html) self.deleted_nbsps = True @@ -538,7 +538,7 @@ class HeuristicProcessor: elif content.find('scenebreak') != -1: return content else: - content = re.sub('(?i)\\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', content) + content = re.sub(r'(?i)\d+)[^>]*>', r'\n\n style="'+top_margin+bottom_margin+'">', content) return content html = blanks_around_headings.sub(merge_header_whitespace, html) @@ -551,7 +551,7 @@ class HeuristicProcessor: html = blanks_n_nopunct.sub(markup_whitespaces, html) if self.html_preprocess_sections > self.min_chapters: - html = re.sub('(?si)^.*?(?=\\d+).*', '\\g', replacement_break)) + width = int(re.sub(r'.*?width(:|=)(?P\d+).*', r'\g', replacement_break)) except: scene_break = hr_open+'
' self.log.warn('Invalid replacement scene break' ' expression, using default') else: - replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break) + replacement_break = re.sub(r'(?i)(width=\d+\\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break) divpercent = (100 - width) // 2 hr_open = re.sub(r'45', str(divpercent), hr_open) scene_break = hr_open+replacement_break+'' @@ -617,24 +617,24 @@ class HeuristicProcessor: else: from calibre.utils.html2text import html2text replacement_break = html2text(replacement_break) - replacement_break = re.sub('\\s', ' ', replacement_break) + replacement_break = re.sub(r'\s', ' ', replacement_break) scene_break = self.scene_break_open+replacement_break+'

' else: - replacement_break = re.sub('\\s', ' ', replacement_break) + replacement_break = re.sub(r'\s', ' ', replacement_break) scene_break = self.scene_break_open+replacement_break+'

' return scene_break def check_paragraph(self, content): - content = re.sub('\\s*]*>\\s*', '', content) - if re.match('.*["\'.!?:]$', content): + content = re.sub(r'\s*]*>\s*', '', content) + if re.match(r'.*["\'.!?:]$', content): # print('detected this as a paragraph') return True else: return False def abbyy_processor(self, html): - abbyy_line = re.compile('((?P[^"]*?);?">)(?P.*?)(?P

)|(?P]*>))', re.IGNORECASE) + abbyy_line = re.compile(r'((?P[^"]*?);?">)(?P.*?)(?P

)|(?P]*>))', re.IGNORECASE) empty_paragraph = '\n

\n' self.in_blockquote = False self.previous_was_paragraph = False @@ -680,7 +680,7 @@ class HeuristicProcessor: if style == 'text-align' and setting != 'left': text_align = style+':'+setting+';' if style == 'text-indent': - setting = int(re.sub('\\s*pt\\s*', '', setting)) + setting = int(re.sub(r'\s*pt\s*', '', setting)) if 9 < setting < 14: text_indent = indented_text else: @@ -853,7 +853,7 @@ class HeuristicProcessor: # If non-blank scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): self.log.debug('Formatting scene breaks') - html = re.sub('(?i)]*>\\s*\\s*', '

', html) + html = re.sub(r'(?i)]*>\s*\s*', '

', html) html = self.detect_scene_breaks(html) html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) @@ -870,9 +870,9 @@ class HeuristicProcessor: replacement_break = self.markup_user_break(replacement_break) if scene_break_count >= 1: html = detected_scene_break.sub(replacement_break, html) - html = re.sub(']*>\\s*

', replacement_break, html) + html = re.sub(r']*>\s*

', replacement_break, html) else: - html = re.sub(']*>\\s*

', replacement_break, html) + html = re.sub(r']*>\s*

', replacement_break, html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index b82e6ca7aa..c0b53ad756 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -62,7 +62,7 @@ class TextRun: self.first_html_parent = first_html_parent if self.ws_pat is None: TextRun.ws_pat = self.ws_pat = re.compile(r'\s+') - TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile('(\u00ad)') + TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile(r'(\u00ad)') self.style = style self.texts = [] self.link = None diff --git a/src/calibre/ebooks/metadata/pml.py b/src/calibre/ebooks/metadata/pml.py index 0fa91dc071..e0df26a905 100644 --- a/src/calibre/ebooks/metadata/pml.py +++ b/src/calibre/ebooks/metadata/pml.py @@ -42,21 +42,21 @@ def get_metadata(stream, extract_cover=True): for comment in re.findall(br'(?ms)\\v.*?\\v', pml): m = re.search(br'TITLE="(.*?)"', comment) if m: - mi.title = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) + mi.title = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) m = re.search(br'AUTHOR="(.*?)"', comment) if m: if mi.authors == [_('Unknown')]: mi.authors = [] - mi.authors.append(re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))) + mi.authors.append(re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))) m = re.search(br'PUBLISHER="(.*?)"', comment) if m: - mi.publisher = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) + mi.publisher = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) m = re.search(br'COPYRIGHT="(.*?)"', comment) if m: - mi.rights = 
re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) + mi.rights = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) m = re.search(br'ISBN="(.*?)"', comment) if m: - mi.isbn = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) + mi.isbn = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) return mi diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py index 496ee9c03f..de0421cb74 100644 --- a/src/calibre/ebooks/metadata/txt.py +++ b/src/calibre/ebooks/metadata/txt.py @@ -31,7 +31,7 @@ def get_metadata(stream, extract_cover=True): mdata = mdata[:1024] - mo = re.search('(?u)^[ ]*(?P.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) + mo = re.search(r'(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) if mo is not None: mi.title = mo.group('title') mi.authors = mo.group('author').split(',') diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index dfdc3af089..f81fee5cc2 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -393,8 +393,7 @@ class MobiReader: self.processed_html = self.processed_html.replace('</html>', '') def remove_random_bytes(self, html): - return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', - '', html) + return re.sub(r'\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', '', html) def ensure_unit(self, raw, unit='px'): if re.search(r'\d+$', raw) is not None: diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 0bc2f598fa..3734ed1c21 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ 
-1340,7 +1340,7 @@ class EpubContainer(Container): break if raw_unique_identifier is not None: idpf_key = raw_unique_identifier - idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) + idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key) idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest() return package_id, raw_unique_identifier, idpf_key diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index f479862ad5..c57225dbac 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -124,8 +124,8 @@ class Reader132(FormatReader): if self.header_record.footnote_count > 0: html += '<br /><h1>%s</h1>' % _('Footnotes') - footnoteids = re.findall( - '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + footnoteids = re.findall(r'\w+(?=\x00)', + self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)): self.log.debug('Extracting footnote page %i' % i) if fid < len(footnoteids): @@ -136,8 +136,8 @@ class Reader132(FormatReader): if self.header_record.sidebar_count > 0: html += '<br /><h1>%s</h1>' % _('Sidebar') - sidebarids = re.findall( - '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + sidebarids = re.findall(r'\w+(?=\x00)', + self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)): self.log.debug('Extracting sidebar page %i' % i) if sid < len(sidebarids): diff --git 
a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 4c990abeae..e6252210e9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -32,7 +32,7 @@ class PdbHeaderReader: def name(self): self.stream.seek(0) - return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b'')) + return re.sub(br'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b'')) def full_section_info(self, number): if not (0 <= number < self.num_sections): @@ -70,7 +70,7 @@ class PdbHeaderBuilder: self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8') if isinstance(title, str): title = title.encode('ascii', 'replace') - self.title = b'%s\x00' % re.sub(b'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31] + self.title = b'%s\x00' % re.sub(br'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31] def build_header(self, section_lengths, out_stream): ''' diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 7f0aa9d12f..9324d3fb24 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -94,7 +94,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I) raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I) raw = xml_replace_entities(raw) - raw = re.sub('[\u00a0\u2029]', ' ', raw) + raw = re.sub(r'[\u00a0\u2029]', ' ', raw) i.write(raw.encode('utf-8')) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b787010ba4..37bda0735a 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -196,7 +196,7 @@ class PMLMLizer: # Turn all characters that cannot be represented by themself into their # PML code equivalent - text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) + text = re.sub(r'[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) # Remove excess spaces at 
beginning and end of lines text = re.sub(r'(?m)^[ ]+', '', text) @@ -209,14 +209,14 @@ class PMLMLizer: text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text) # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub(r'\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: - text = re.sub('\n{2,}', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) # Only indent lines that don't have special formatting text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text) else: - text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(r'\n{3,}', '\n\n', text) return text diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index dc4e7f0225..38760af55d 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -61,8 +61,8 @@ def to_int(x): def clean(text): - text = re.sub('\\s*\n\\s*', '\n', text) - text = re.sub('[ \t]{2,}', ' ', text) + text = re.sub(r'\s*\n\s*', '\n', text) + text = re.sub(r'[ \t]{2,}', ' ', text) return text.strip() diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 2559fb516a..73e872d046 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -199,7 +199,7 @@ class RTFMLizer: # Remove excessive spaces text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub('\t{2,}', '\t', text) + text = re.sub(r'\t{2,}', '\t', text) text = text.replace('\t ', '\t') # Remove excessive line breaks diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 26b6dfe799..884e589033 100644 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -719,7 +719,7 @@ class ProcessTokens: def divide_num(self, numerator, denominator): try: # calibre why ignore negative number? 
Wrong in case of \fi - numerator = float(re.search('[0-9.\\-]+', numerator).group()) + numerator = float(re.search(r'[0-9.\-]+', numerator).group()) except TypeError: if self.__run_level > 3: msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \ diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py index 54da31fcbb..43534bddb3 100644 --- a/src/calibre/ebooks/snb/snbml.py +++ b/src/calibre/ebooks/snb/snbml.py @@ -162,12 +162,12 @@ class SNBMLizer: # text = re.sub('[ ]{2,}', ' ', text) # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub(r'\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: - text = re.sub('\n{2,}', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) text = re.sub(r'(?imu)^(?=.)', '\t', text) else: - text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(r'\n{3,}', '\n\n', text) # Replace spaces at the beginning and end of lines text = re.sub(r'(?imu)^[ ]+', '', text) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 0e8d7512c8..65684a9e20 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -58,7 +58,7 @@ class MarkdownMLizer(OEB2HTML): # Remove tabs that aren't at the beginning of a line new_text = [] for l in text.splitlines(): - start = re.match('\t+', l) + start = re.match(r'\t+', l) if start: start = start.group() else: @@ -71,7 +71,7 @@ class MarkdownMLizer(OEB2HTML): text = re.sub(r'(?msu)^[ ]+$', '', text) # Reduce blank lines - text = re.sub('(?msu)\n{7,}', '\n' * 6, text) + text = re.sub(r'(?msu)\n{7,}', '\n' * 6, text) # Remove blank lines at beginning and end of document. 
text = re.sub(r'^\s*', '', text) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index c818cfe2f2..5af1171685 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -31,7 +31,7 @@ def clean_txt(txt): txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) # Replace whitespace at the beginning of the line with   - txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt) + txt = re.sub(r'(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt) # Condense redundant spaces txt = re.sub(r'[ ]{2,}', ' ', txt) @@ -40,7 +40,7 @@ def clean_txt(txt): txt = re.sub(r'^\s+(?=.)', '', txt) txt = re.sub(r'(?<=.)\s+$', '', txt) # Remove excessive line breaks. - txt = re.sub('\n{5,}', '\n\n\n\n', txt) + txt = re.sub(r'\n{5,}', '\n\n\n\n', txt) # remove ASCII invalid chars : 0 to 8 and 11-14 to 24 txt = clean_ascii_chars(txt) @@ -190,7 +190,7 @@ def separate_paragraphs_single_line(txt): def separate_paragraphs_print_formatted(txt): - txt = re.sub('(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) + txt = re.sub(r'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) return txt diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 36a27366e2..b795f0116e 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -109,16 +109,16 @@ class TextileMLizer(OEB2HTML): # reduce blank lines text = re.sub(r'\n{3}', r'\n\np. \n\n', text) - text = re.sub('%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text) + text = re.sub(r'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) # Check span following blank para text = re.sub(r'\n+ +%', r' %', text) - text = re.sub('p[<>=]{1,2}\\.\n\n?', r'', text) + text = re.sub(r'p[<>=]{1,2}\.\n\n?', '', text) # blank paragraph text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) # blank paragraph text = text.replace('\n\xa0', '\np. 
') # blank paragraph - text = re.sub('\np[<>=]{1,2}?\\. \xa0', r'\np. ', text) + text = re.sub(r'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) # sort out spaces in tables diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 9c811f2d77..2209ae495d 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -123,19 +123,19 @@ class TXTMLizer: text = text.replace('\f+', ' ') # Single line paragraph. - text = re.sub('(?<=.)\n(?=.)', ' ', text) + text = re.sub(r'(?<=.)\n(?=.)', ' ', text) # Remove multiple spaces. text = re.sub(r'[ ]{2,}', ' ', text) # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub(r'\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: - text = re.sub('\n{2,}', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text) text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text) else: - text = re.sub('\n{7,}', '\n\n\n\n\n\n', text) + text = re.sub(r'\n{7,}', '\n\n\n\n\n\n', text) # Replace spaces at the beginning and end of lines # We don't replace tabs because those are only added diff --git a/src/calibre/ebooks/unihandecode/jadecoder.py b/src/calibre/ebooks/unihandecode/jadecoder.py index 31fcc11ad5..ed1e05798e 100644 --- a/src/calibre/ebooks/unihandecode/jadecoder.py +++ b/src/calibre/ebooks/unihandecode/jadecoder.py @@ -87,4 +87,4 @@ class Jadecoder(Unidecoder): text = self.conv.do(text) except Exception: pass - return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text) + return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text) diff --git a/src/calibre/ebooks/unihandecode/unidecoder.py b/src/calibre/ebooks/unihandecode/unidecoder.py index 
6d7c9f16f7..aba5069030 100644 --- a/src/calibre/ebooks/unihandecode/unidecoder.py +++ b/src/calibre/ebooks/unihandecode/unidecoder.py @@ -73,7 +73,7 @@ class Unidecoder: def decode(self, text): # Replace characters larger than 127 with their ASCII equivalent. - return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text) + return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text) def replace_point(self, codepoint): ''' diff --git a/src/calibre/gui2/fts/search.py b/src/calibre/gui2/fts/search.py index 078d97f5aa..83655a5a74 100644 --- a/src/calibre/gui2/fts/search.py +++ b/src/calibre/gui2/fts/search.py @@ -733,7 +733,7 @@ class ResultDetails(QWidget): def render_results(self, results, individual_match=None): html = [] space_pat = re.compile(r'\s+') - markup_pat = re.compile('\x1d') + markup_pat = re.compile(r'\x1d') def markup_text(text): count = 0 diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 7166f6d792..707c31ec25 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -433,7 +433,7 @@ def run_gui_(opts, args, app, gui_debug=None): winutil.prepare_for_restart() with open(debugfile, 'r+b') as f: raw = f.read() - raw = re.sub(b'(?<!\r)\n', b'\r\n', raw) + raw = re.sub(br'(?<!\r)\n', br'\r\n', raw) f.seek(0) f.truncate() f.write(raw) diff --git a/src/calibre/gui2/tweak_book/editor/syntax/html.py b/src/calibre/gui2/tweak_book/editor/syntax/html.py index 9ac523474c..ea88c3dfde 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/html.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py @@ -39,7 +39,7 @@ attribute_name_pat = re.compile(r'''[^%s"'/><=]+''' % space_chars) self_closing_pat = re.compile(r'/\s*>') unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars) cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags} -nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens +nbsp_pat = 
re.compile(r'[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens NORMAL = 0 IN_OPENING_TAG = 1 diff --git a/src/calibre/library/catalogs/utils.py b/src/calibre/library/catalogs/utils.py index b297db3cf9..9a3a2b6ae8 100644 --- a/src/calibre/library/catalogs/utils.py +++ b/src/calibre/library/catalogs/utils.py @@ -119,7 +119,7 @@ class NumberToText: # {{{ self.text = NumberToText(self.number.replace('%',' percent')).text # Test for decimal - elif re.search('\\.',self.number): + elif '.' in self.number: if self.verbose: self.log('Decimal: %s' % self.number) self.number_as_float = self.number @@ -150,7 +150,7 @@ class NumberToText: # {{{ self.text = NumberToText(self.number_as_float).text # Test for hybrid e.g., 'K2, 2nd, 10@10' - elif re.search('[\\D]+', self.number): + elif re.search(r'[\D]+', self.number): if self.verbose: self.log('Hybrid: %s' % self.number) # Split the token into number/text diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 661fe829de..a896928a8b 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -11,7 +11,7 @@ from calibre.utils.html2text import html2text # Hackish - ignoring sentences ending or beginning in numbers to avoid # confusion with decimal points. 
-lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])') +lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe', re.IGNORECASE) diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py index 7d6fe96593..85043bc0f4 100644 --- a/src/calibre/library/custom_columns.py +++ b/src/calibre/library/custom_columns.py @@ -674,7 +674,7 @@ class CustomColumns: editable=True, display={}): if not label: raise ValueError(_('No label was provided')) - if re.match('^\\w*$', label) is None or not label[0].isalpha() or label.lower() != label: + if re.match(r'^\w*$', label) is None or not label[0].isalpha() or label.lower() != label: raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter')) if datatype not in self.CUSTOM_DATA_TYPES: raise ValueError('%r is not a supported data type'%datatype) diff --git a/src/calibre/spell/dictionary.py b/src/calibre/spell/dictionary.py index a26f57e7f6..27dff56061 100644 --- a/src/calibre/spell/dictionary.py +++ b/src/calibre/spell/dictionary.py @@ -193,7 +193,7 @@ def load_dictionary(dictionary): class Dictionaries: def __init__(self): - self.remove_hyphenation = re.compile('[\u2010-]+') + self.remove_hyphenation = re.compile(r'[\u2010-]+') self.negative_pat = re.compile(r'-[.\d+]') self.fix_punctuation_pat = re.compile(r'''[:.]''') self.dictionaries = {} diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index e3f48f84c4..71a2076d39 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -2553,7 +2553,7 @@ class BibTeX: self.ascii_bibtex = True # This substitution is based on the description of cite key restrictions at # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html - self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]') + self.invalid_cit = re.compile(r'[ 
"@\',\#}{~%&$^]') self.upper = re.compile('[' + string.ascii_uppercase + ']') self.escape = re.compile(r'[#&%_]') diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index 394ca9eeba..47b2f4d745 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -94,4 +94,4 @@ def unescape(text, rm=False, rchar=''): if rm: return rchar # replace by char return text # leave as is - return re.sub('&#?\\w+;', fixup, text) + return re.sub(r'&#?\w+;', fixup, text) diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 82e9da36fa..518040f610 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -545,7 +545,7 @@ class RecursiveFetcher: dsrc = self.fetch_url(iurl) newbaseurl = dsrc.newurl if len(dsrc) == 0 or \ - len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0: + len(re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0: raise ValueError('No content at URL %r'%iurl) if callable(self.encoding): dsrc = self.encoding(dsrc) diff --git a/src/odf/easyliststyle.py b/src/odf/easyliststyle.py index 81dcc2cc7a..9b23d1cf17 100644 --- a/src/odf/easyliststyle.py +++ b/src/odf/easyliststyle.py @@ -60,7 +60,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels): displayLevels = 0 listStyle = ListStyle(name=styleName) numFormatPattern = re.compile(r'([1IiAa])') - cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?') + cssLengthPattern = re.compile(r'([^a-z]+)\s*([a-z]+)?') m = cssLengthPattern.search(spacing) if (m is not None): cssLengthNum = float(m.group(1))