always use raw-string for regex (manual)

ruff 'RUF039'
This commit is contained in:
un-pogaz 2025-01-24 11:14:20 +01:00
parent ac6912565a
commit 3720de10d2
70 changed files with 136 additions and 142 deletions

View File

@ -16,5 +16,5 @@ class Android_com_pl(BasicNewsRecipe):
remove_tags_after = [{'class': 'post-content'}] remove_tags_after = [{'class': 'post-content'}]
remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})] remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<p>.{,1}</p>', re.DOTALL), lambda match: '')] (re.compile(r'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
feeds = [(u'Android', u'http://android.com.pl/feed/')] feeds = [(u'Android', u'http://android.com.pl/feed/')]

View File

@ -17,8 +17,8 @@ class BenchmarkPl(BasicNewsRecipe):
extra_css = 'ul {list-style-type: none;}' extra_css = 'ul {list-style-type: none;}'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501 preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501, RUF039
re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: RUF039
keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict( keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict(
name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')] name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')]

View File

@ -41,7 +41,7 @@ class Blic(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(name='div', attrs={'id': 'article_info'}) remove_tags_before = dict(name='div', attrs={'id': 'article_info'})
remove_tags = [ remove_tags = [
dict(name=['object', 'link', 'meta', 'base', 'object', 'embed'])] dict(name=['object', 'link', 'meta', 'base', 'object', 'embed'])]

View File

@ -27,7 +27,7 @@ class CveceZla(BasicNewsRecipe):
'comment': description, 'tags': 'igre, muzika, film, blog, Srbija', 'publisher': 'Mehmet Krljic', 'language': language 'comment': description, 'tags': 'igre, muzika, film, blog, Srbija', 'publisher': 'Mehmet Krljic', 'language': language
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(attrs={'class': 'navigation'}) remove_tags_before = dict(attrs={'class': 'navigation'})
remove_tags_after = dict(attrs={'class': 'commentlist'}) remove_tags_after = dict(attrs={'class': 'commentlist'})

View File

@ -61,7 +61,7 @@ class MediaDaumRecipe(BasicNewsRecipe):
lambda match: '<em>'), lambda match: '<em>'),
(re.compile(r'<i>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE), (re.compile(r'<i>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
lambda match: '<i>'), lambda match: '<i>'),
(re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501 (re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501, RUF039
lambda match: '</div>'), lambda match: '</div>'),
] ]

View File

@ -42,7 +42,7 @@ class DnevnikCro(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [dict(name='div', attrs={'id': 'article'})] keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

View File

@ -20,8 +20,8 @@ class DziennikWschodni(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501 preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501, RUF039
(re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501 (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501, RUF039
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',

View File

@ -45,9 +45,7 @@ class Esensja(BasicNewsRecipe):
''' '''
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile( (re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]
u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
]
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('http://www.esensja.pl/magazyn/') soup = self.index_to_soup('http://www.esensja.pl/magazyn/')

View File

@ -23,9 +23,7 @@ class EsensjaRSS(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile( (re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]
u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
]
remove_attributes = ['style', 'bgcolor', 'alt', 'color'] remove_attributes = ['style', 'bgcolor', 'alt', 'color']
keep_only_tags = [dict(attrs={'class': 'sekcja'}), ] keep_only_tags = [dict(attrs={'class': 'sekcja'}), ]
remove_tags_after = dict(id='tekst') remove_tags_after = dict(id='tekst')

View File

@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe):
'ul.sep-line > li + li::before {content: " | "} ' 'ul.sep-line > li + li::before {content: " | "} '
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
(re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), (re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), # noqa: RUF039
(re.compile(type(u'')(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />') (re.compile(type(u'')(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />')
] ]
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',

View File

@ -34,7 +34,7 @@ class gw_krakow(BasicNewsRecipe):
# rules for gazeta.pl # rules for gazeta.pl
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] (re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
keep_only_tags = [dict(id='gazeta_article')] keep_only_tags = [dict(id='gazeta_article')]
remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict( remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

View File

@ -33,7 +33,7 @@ class gw_wawa(BasicNewsRecipe):
# rules for gazeta.pl # rules for gazeta.pl
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] (re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
keep_only_tags = [dict(id='gazeta_article')] keep_only_tags = [dict(id='gazeta_article')]
remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict( remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

View File

@ -54,11 +54,11 @@ def solve_captcha(captcha):
# Parse into parts # Parse into parts
pattern = re.compile( pattern = re.compile(
u'(?P<first_component>[0-9]+)?' r'(?P<first_component>[0-9]+)?'
u'\\s*(?P<operator>[+×])\\s*' u'\\s*(?P<operator>[+×])\\s*' # noqa: RUF039
u'(?P<second_component>[0-9]+)' r'(?P<second_component>[0-9]+)'
u'\\s*(=)\\s*' r'\s*(=)\s*'
u'(?P<result>[0-9]+)?', re.UNICODE) r'(?P<result>[0-9]+)?', re.UNICODE)
calculationParts = re.search(pattern, numeric_problem) calculationParts = re.search(pattern, numeric_problem)
if calculationParts is None: if calculationParts is None:

View File

@ -16,7 +16,7 @@ class in4(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')] (re.compile(r'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})] keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})]
remove_tags_after = dict(name='img', attrs={'title': 'komentarze'}) remove_tags_after = dict(name='img', attrs={'title': 'komentarze'})
remove_tags = [dict(name='img', attrs={'title': 'komentarze'})] remove_tags = [dict(name='img', attrs={'title': 'komentarze'})]

View File

@ -29,9 +29,9 @@ class KopalniaWiedzy(BasicNewsRecipe):
extra_css = '.topimage {margin-top: 30px}' extra_css = '.topimage {margin-top: 30px}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), (re.compile(r'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>'), lambda match: '<img class="topimage" ' + match.group(1) + '>'),
(re.compile(u'<br /><br />'), (re.compile(r'<br /><br />'),
lambda match: '<br/>') lambda match: '<br/>')
] ]

View File

@ -66,7 +66,7 @@ class LeMondeAbonne(BasicNewsRecipe):
dict(name='div', attrs={'class': 'po-copy'}) dict(name='div', attrs={'class': 'po-copy'})
] ]
article_id_pattern = re.compile('[0-9]+\\.html') article_id_pattern = re.compile(r'[0-9]+\.html')
article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
def get_browser(self): def get_browser(self):

View File

@ -43,7 +43,7 @@ class NacionalCro(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [dict(name=['object', 'link', 'embed'])] remove_tags = [dict(name=['object', 'link', 'embed'])]

View File

@ -11,8 +11,8 @@ class NaTemat(BasicNewsRecipe):
description = u'informacje, komentarze, opinie' description = u'informacje, komentarze, opinie'
category = 'news' category = 'news'
language = 'pl' language = 'pl'
preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501 preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501, RUF039
(re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501 (re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501, RUF039
cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
no_stylesheets = True no_stylesheets = True
keep_only_tags = [dict(id='main')] keep_only_tags = [dict(id='main')]

View File

@ -34,7 +34,7 @@ class NjuzNet(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language 'comment': description, 'tags': category, 'publisher': publisher, 'language': language
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [ keep_only_tags = [
dict(attrs={'id': 'entryMeta'}), dict(attrs={'class': 'post'}) dict(attrs={'id': 'entryMeta'}), dict(attrs={'class': 'post'})

View File

@ -36,7 +36,7 @@ class NoviList_hr(BasicNewsRecipe):
p{display: block} p{display: block}
''' '''
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
conversion_options = { conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True

View File

@ -35,7 +35,7 @@ class Novosti(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [dict(attrs={'class': [ keep_only_tags = [dict(attrs={'class': [
'articleTitle', 'articleInfo', 'articleLead', 'singlePhoto fl', 'articleBody']})] 'articleTitle', 'articleInfo', 'articleLead', 'singlePhoto fl', 'articleBody']})]

View File

@ -45,7 +45,7 @@ class Nspm(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])] remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])]
remove_attributes = ['width', 'height', 'lang', 'xmlns:fb', remove_attributes = ['width', 'height', 'lang', 'xmlns:fb',
'xmlns:og', 'vspace', 'hspace', 'type', 'start', 'size'] 'xmlns:og', 'vspace', 'hspace', 'type', 'start', 'size']

View File

@ -31,8 +31,7 @@ class ObservatorulCultural(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
soup = self.index_to_soup( soup = self.index_to_soup(
'http://www.observatorcultural.ro/Arhiva*-archive.html') 'http://www.observatorcultural.ro/Arhiva*-archive.html')
issueTag = soup.find('a', href=re.compile( issueTag = soup.find('a', href=re.compile(r'observatorcultural.ro/Numarul'))
'observatorcultural.ro\\/Numarul'))
issueURL = issueTag['href'] issueURL = issueTag['href']
print(issueURL) print(issueURL)
issueSoup = self.index_to_soup(issueURL) issueSoup = self.index_to_soup(issueURL)

View File

@ -34,7 +34,7 @@ class Pescanik(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language 'comment': description, 'tags': category, 'publisher': publisher, 'language': language
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [ remove_tags = [
dict(name=['object', 'link', 'meta', 'script', 'iframe', 'embed'])] dict(name=['object', 'link', 'meta', 'script', 'iframe', 'embed'])]
keep_only_tags = [ keep_only_tags = [

View File

@ -31,9 +31,9 @@ class AdvancedUserRecipe1289939440(BasicNewsRecipe):
] ]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<p class="perex[^"]*">[^<]*<img[^>]*>', (re.compile(r'<p class="perex[^"]*">[^<]*<img[^>]*>',
re.DOTALL), lambda match: '<p class="intro">'), re.DOTALL), lambda match: '<p class="intro">'),
(re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>', (re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>', # noqa: RUF039
re.DOTALL), lambda match: '<!--deleted-->') re.DOTALL), lambda match: '<!--deleted-->')
] ]

View File

@ -38,7 +38,7 @@ class RTS(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
feeds = [ feeds = [

View File

@ -25,4 +25,4 @@ class swiatczytnikow(BasicNewsRecipe):
dict(name='div', attrs={'class': 'feedflare'})] dict(name='div', attrs={'class': 'feedflare'})]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')] (re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')] # noqa: RUF039

View File

@ -15,8 +15,8 @@ class Tablety_pl(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), # noqa: RUF039
(re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] (re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] # noqa: RUF039
keep_only_tags = [dict(attrs={'class': ['featured-image', 'article-content clearfix']})] keep_only_tags = [dict(attrs={'class': ['featured-image', 'article-content clearfix']})]
remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments', remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments',
'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})] 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]

View File

@ -27,7 +27,7 @@ class TheCultOfGhoul(BasicNewsRecipe):
'comment': description, 'tags': 'film, blog, srbija, strava, uzas', 'publisher': 'Dejan Ognjanovic', 'language': language 'comment': description, 'tags': 'film, blog, srbija, strava, uzas', 'publisher': 'Dejan Ognjanovic', 'language': language
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')] feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')]

View File

@ -43,7 +43,7 @@ class VecernjiList(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [ remove_tags = [
dict(name=['object', 'link', 'embed']), dict( dict(name=['object', 'link', 'embed']), dict(

View File

@ -41,7 +41,7 @@ class Vreme(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(attrs={'class': 'toc-heading'}) remove_tags_before = dict(attrs={'class': 'toc-heading'})
remove_tags_after = dict(attrs={'class': 'footer'}) remove_tags_after = dict(attrs={'class': 'footer'})

View File

@ -10,8 +10,8 @@ class WNP(BasicNewsRecipe):
description = u'Wirtualny Nowy Przemysł' description = u'Wirtualny Nowy Przemysł'
category = 'economy' category = 'economy'
language = 'pl' language = 'pl'
preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), # noqa: RUF039
(re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] (re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] # noqa: RUF039
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True

View File

@ -48,7 +48,7 @@ class ZeitEPUBAbo(BasicNewsRecipe):
preprocess_regexps = [ preprocess_regexps = [
# filtering for correct dashes ("Gedankenstrich" and "bis") # filtering for correct dashes ("Gedankenstrich" and "bis")
(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), # noqa: RUF039
(re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
(re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro (re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
# fix the number dash number dash for the title image that was broken # fix the number dash number dash for the title image that was broken
@ -130,9 +130,9 @@ class ZeitEPUBAbo(BasicNewsRecipe):
(re.compile( (re.compile(
r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''),
# before closing quotation # before closing quotation
(re.compile(u' \u00AB'), lambda match: u'\u00AB '), (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # noqa: RUF039
# after opening quotation # after opening quotation
(re.compile(u'\u00BB '), lambda match: u' \u00BB'), (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # noqa: RUF039
# filtering for spaces in large numbers for better readability # filtering for spaces in large numbers for better readability
# end of the number with some character following # end of the number with some character following
(re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'), (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'),
@ -151,25 +151,25 @@ class ZeitEPUBAbo(BasicNewsRecipe):
# filtering for unicode characters that are missing on the Kindle, # filtering for unicode characters that are missing on the Kindle,
# try to replace them with meaningful work-arounds # try to replace them with meaningful work-arounds
# subscript-0 # subscript-0
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # noqa: RUF039
# subscript-1 # subscript-1
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # noqa: RUF039
# subscript-2 # subscript-2
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # noqa: RUF039
# subscript-3 # subscript-3
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # noqa: RUF039
# subscript-4 # subscript-4
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # noqa: RUF039
# subscript-5 # subscript-5
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # noqa: RUF039
# subscript-6 # subscript-6
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # noqa: RUF039
# subscript-7 # subscript-7
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # noqa: RUF039
# subscript-8 # subscript-8
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # noqa: RUF039
# subscript-9 # subscript-9
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # noqa: RUF039
# always change CO2 # always change CO2
(re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2 (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
# remove *** paragraphs # remove *** paragraphs

View File

@ -21,7 +21,7 @@ class PagebreakPageGenerator(IPageGenerator):
''' Determine pages based on the presence of <*pagebreak*/>. ''' ''' Determine pages based on the presence of <*pagebreak*/>. '''
html = mobi_html(mobi_file_path) html = mobi_html(mobi_file_path)
pages = [] pages = []
for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html): for m in re.finditer(br'<[^>]*pagebreak[^>]*>', html):
pages.append(m.end()) pages.append(m.end())
return Pages(pages) return Pages(pages)

View File

@ -32,7 +32,7 @@ class TCRCompressor:
The intent is to create more unused codes. The intent is to create more unused codes.
''' '''
possible_codes = [] possible_codes = []
a_code = set(re.findall(b'(?ms).', self.coded_txt)) a_code = set(re.findall(br'(?ms).', self.coded_txt))
for code in a_code: for code in a_code:
single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt)) single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
@ -57,7 +57,7 @@ class TCRCompressor:
''' '''
Create new codes from codes that occur in pairs often. Create new codes from codes that occur in pairs often.
''' '''
possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt))) possible_new_codes = list(set(re.findall(br'(?ms)..', self.coded_txt)))
new_codes_count = [] new_codes_count = []
for c in possible_new_codes: for c in possible_new_codes:
@ -74,7 +74,7 @@ class TCRCompressor:
def compress(self, txt): def compress(self, txt):
self._reset() self._reset()
self.codes = list(set(re.findall(b'(?ms).', txt))) self.codes = list(set(re.findall(br'(?ms).', txt)))
# Replace the text with their corresponding code # Replace the text with their corresponding code
# FIXME: python3 is native bytearray, but all we want are bytes # FIXME: python3 is native bytearray, but all we want are bytes

View File

@ -46,7 +46,7 @@ class EPUBInput(InputFormatPlugin):
from lxml import etree from lxml import etree
idpf_key = opf.raw_unique_identifier idpf_key = opf.raw_unique_identifier
if idpf_key: if idpf_key:
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest() idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
key = None key = None
for item in opf.identifier_iter(): for item in opf.identifier_iter():

View File

@ -503,7 +503,7 @@ class EPUBOutput(OutputFormatPlugin):
tag.tag = XHTML('div') tag.tag = XHTML('div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
special_chars = re.compile('[\u200b\u00ad]') special_chars = re.compile(r'[\u200b\u00ad]')
for elem in root.iterdescendants('*'): for elem in root.iterdescendants('*'):
if elem.text: if elem.text:
elem.text = special_chars.sub('', elem.text) elem.text = special_chars.sub('', elem.text)

View File

@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
res = as_bytes(transform.tostring(result)) res = as_bytes(transform.tostring(result))
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n # clean multiple \n
res = re.sub(b'\n+', b'\n', res) res = re.sub(br'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res) # res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}', # res = re.sub('(?<=\n)\n{2}',

View File

@ -94,7 +94,7 @@ class DocAnalysis:
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt': elif format == 'txt':
linere = re.compile('.*?\n') linere = re.compile(r'.*?\n')
self.lines = linere.findall(raw) self.lines = linere.findall(raw)
def line_length(self, percent): def line_length(self, percent):

View File

@ -57,8 +57,8 @@ class HeuristicProcessor:
' chapters. - ' + str(chap)) ' chapters. - ' + str(chap))
return '<h2>'+chap+'</h2>\n' return '<h2>'+chap+'</h2>\n'
else: else:
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$') delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
delete_quotes = re.compile('\'"') delete_quotes = re.compile(r'\'"')
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap))) txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title))) txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1 self.html_preprocess_sections = self.html_preprocess_sections + 1
@ -109,7 +109,7 @@ class HeuristicProcessor:
be marked up to return true. be marked up to return true.
''' '''
htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL) htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) line_end_ere = re.compile(r'(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw) htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw) line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end) tot_htm_ends = len(htm_end)
@ -417,7 +417,7 @@ class HeuristicProcessor:
# Add markup naively # Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or # TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion # other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)') add_markup = re.compile(r'(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html) html = add_markup.sub('</p>\n<p>', html)
return html return html
@ -440,7 +440,7 @@ class HeuristicProcessor:
# Get rid of empty <o:p> tags to simplify other processing # Get rid of empty <o:p> tags to simplify other processing
html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html) html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html)
# Delete microsoft 'smart' tags # Delete microsoft 'smart' tags
html = re.sub('(?i)</?st1:\\w+>', '', html) html = re.sub(r'(?i)</?st1:\w+>', '', html)
# Re-open self closing paragraph tags # Re-open self closing paragraph tags
html = re.sub(r'<p[^>/]*/>', '<p> </p>', html) html = re.sub(r'<p[^>/]*/>', '<p> </p>', html)
# Get rid of empty span, bold, font, em, & italics tags # Get rid of empty span, bold, font, em, & italics tags
@ -451,7 +451,7 @@ class HeuristicProcessor:
html = re.sub( html = re.sub(
r'\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}'.format(open=open_fmt_pat, close=close_fmt_pat) , ' ', html) r'\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}'.format(open=open_fmt_pat, close=close_fmt_pat) , ' ', html)
# delete surrounding divs from empty paragraphs # delete surrounding divs from empty paragraphs
html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html) html = re.sub(r'<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
# Empty heading tags # Empty heading tags
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html) html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
self.deleted_nbsps = True self.deleted_nbsps = True
@ -538,7 +538,7 @@ class HeuristicProcessor:
elif content.find('scenebreak') != -1: elif content.find('scenebreak') != -1:
return content return content
else: else:
content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content) content = re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', r'\n\n<h\g<hnum> style="'+top_margin+bottom_margin+'">', content)
return content return content
html = blanks_around_headings.sub(merge_header_whitespace, html) html = blanks_around_headings.sub(merge_header_whitespace, html)
@ -551,7 +551,7 @@ class HeuristicProcessor:
html = blanks_n_nopunct.sub(markup_whitespaces, html) html = blanks_n_nopunct.sub(markup_whitespaces, html)
if self.html_preprocess_sections > self.min_chapters: if self.html_preprocess_sections > self.min_chapters:
html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html) html = re.sub(r'(?si)^.*?(?=<h\d)', markup_whitespaces, html)
return html return html
@ -600,13 +600,13 @@ class HeuristicProcessor:
if re.match(r'^<hr', replacement_break): if re.match(r'^<hr', replacement_break):
if replacement_break.find('width') != -1: if replacement_break.find('width') != -1:
try: try:
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break)) width = int(re.sub(r'.*?width(:|=)(?P<wnum>\d+).*', r'\g<wnum>', replacement_break))
except: except:
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>' scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
self.log.warn('Invalid replacement scene break' self.log.warn('Invalid replacement scene break'
' expression, using default') ' expression, using default')
else: else:
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break) replacement_break = re.sub(r'(?i)(width=\d+\\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
divpercent = (100 - width) // 2 divpercent = (100 - width) // 2
hr_open = re.sub(r'45', str(divpercent), hr_open) hr_open = re.sub(r'45', str(divpercent), hr_open)
scene_break = hr_open+replacement_break+'</div>' scene_break = hr_open+replacement_break+'</div>'
@ -617,24 +617,24 @@ class HeuristicProcessor:
else: else:
from calibre.utils.html2text import html2text from calibre.utils.html2text import html2text
replacement_break = html2text(replacement_break) replacement_break = html2text(replacement_break)
replacement_break = re.sub('\\s', '&nbsp;', replacement_break) replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>' scene_break = self.scene_break_open+replacement_break+'</p>'
else: else:
replacement_break = re.sub('\\s', '&nbsp;', replacement_break) replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>' scene_break = self.scene_break_open+replacement_break+'</p>'
return scene_break return scene_break
def check_paragraph(self, content): def check_paragraph(self, content):
content = re.sub('\\s*</?span[^>]*>\\s*', '', content) content = re.sub(r'\s*</?span[^>]*>\s*', '', content)
if re.match('.*["\'.!?:]$', content): if re.match(r'.*["\'.!?:]$', content):
# print('detected this as a paragraph') # print('detected this as a paragraph')
return True return True
else: else:
return False return False
def abbyy_processor(self, html): def abbyy_processor(self, html):
abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE) abbyy_line = re.compile(r'((?P<linestart><p\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n' empty_paragraph = '\n<p> </p>\n'
self.in_blockquote = False self.in_blockquote = False
self.previous_was_paragraph = False self.previous_was_paragraph = False
@ -680,7 +680,7 @@ class HeuristicProcessor:
if style == 'text-align' and setting != 'left': if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';' text_align = style+':'+setting+';'
if style == 'text-indent': if style == 'text-indent':
setting = int(re.sub('\\s*pt\\s*', '', setting)) setting = int(re.sub(r'\s*pt\s*', '', setting))
if 9 < setting < 14: if 9 < setting < 14:
text_indent = indented_text text_indent = indented_text
else: else:
@ -853,7 +853,7 @@ class HeuristicProcessor:
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins. # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False): if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks') self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html) html = re.sub(r'(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html) html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html) html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html) html = self.detect_soft_breaks(html)
@ -870,9 +870,9 @@ class HeuristicProcessor:
replacement_break = self.markup_user_break(replacement_break) replacement_break = self.markup_user_break(replacement_break)
if scene_break_count >= 1: if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html) html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html) html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
else: else:
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html) html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
if self.deleted_nbsps: if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly # put back non-breaking spaces in empty paragraphs so they render correctly

View File

@ -62,7 +62,7 @@ class TextRun:
self.first_html_parent = first_html_parent self.first_html_parent = first_html_parent
if self.ws_pat is None: if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+') TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile('(\u00ad)') TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile(r'(\u00ad)')
self.style = style self.style = style
self.texts = [] self.texts = []
self.link = None self.link = None

View File

@ -42,21 +42,21 @@ def get_metadata(stream, extract_cover=True):
for comment in re.findall(br'(?ms)\\v.*?\\v', pml): for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
m = re.search(br'TITLE="(.*?)"', comment) m = re.search(br'TITLE="(.*?)"', comment)
if m: if m:
mi.title = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) mi.title = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'AUTHOR="(.*?)"', comment) m = re.search(br'AUTHOR="(.*?)"', comment)
if m: if m:
if mi.authors == [_('Unknown')]: if mi.authors == [_('Unknown')]:
mi.authors = [] mi.authors = []
mi.authors.append(re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))) mi.authors.append(re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))))
m = re.search(br'PUBLISHER="(.*?)"', comment) m = re.search(br'PUBLISHER="(.*?)"', comment)
if m: if m:
mi.publisher = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) mi.publisher = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'COPYRIGHT="(.*?)"', comment) m = re.search(br'COPYRIGHT="(.*?)"', comment)
if m: if m:
mi.rights = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) mi.rights = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'ISBN="(.*?)"', comment) m = re.search(br'ISBN="(.*?)"', comment)
if m: if m:
mi.isbn = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))) mi.isbn = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
return mi return mi

View File

@ -31,7 +31,7 @@ def get_metadata(stream, extract_cover=True):
mdata = mdata[:1024] mdata = mdata[:1024]
mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) mo = re.search(r'(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
if mo is not None: if mo is not None:
mi.title = mo.group('title') mi.title = mo.group('title')
mi.authors = mo.group('author').split(',') mi.authors = mo.group('author').split(',')

View File

@ -393,8 +393,7 @@ class MobiReader:
self.processed_html = self.processed_html.replace('</html>', '') self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html): def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', return re.sub(r'\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', '', html)
'', html)
def ensure_unit(self, raw, unit='px'): def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None: if re.search(r'\d+$', raw) is not None:

View File

@ -1340,7 +1340,7 @@ class EpubContainer(Container):
break break
if raw_unique_identifier is not None: if raw_unique_identifier is not None:
idpf_key = raw_unique_identifier idpf_key = raw_unique_identifier
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest() idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
return package_id, raw_unique_identifier, idpf_key return package_id, raw_unique_identifier, idpf_key

View File

@ -124,8 +124,8 @@ class Reader132(FormatReader):
if self.header_record.footnote_count > 0: if self.header_record.footnote_count > 0:
html += '<br /><h1>%s</h1>' % _('Footnotes') html += '<br /><h1>%s</h1>' % _('Footnotes')
footnoteids = re.findall( footnoteids = re.findall(r'\w+(?=\x00)',
'\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)): for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
self.log.debug('Extracting footnote page %i' % i) self.log.debug('Extracting footnote page %i' % i)
if fid < len(footnoteids): if fid < len(footnoteids):
@ -136,8 +136,8 @@ class Reader132(FormatReader):
if self.header_record.sidebar_count > 0: if self.header_record.sidebar_count > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar') html += '<br /><h1>%s</h1>' % _('Sidebar')
sidebarids = re.findall( sidebarids = re.findall(r'\w+(?=\x00)',
'\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)): for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
self.log.debug('Extracting sidebar page %i' % i) self.log.debug('Extracting sidebar page %i' % i)
if sid < len(sidebarids): if sid < len(sidebarids):

View File

@ -32,7 +32,7 @@ class PdbHeaderReader:
def name(self): def name(self):
self.stream.seek(0) self.stream.seek(0)
return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b'')) return re.sub(br'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))
def full_section_info(self, number): def full_section_info(self, number):
if not (0 <= number < self.num_sections): if not (0 <= number < self.num_sections):
@ -70,7 +70,7 @@ class PdbHeaderBuilder:
self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8') self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
if isinstance(title, str): if isinstance(title, str):
title = title.encode('ascii', 'replace') title = title.encode('ascii', 'replace')
self.title = b'%s\x00' % re.sub(b'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31] self.title = b'%s\x00' % re.sub(br'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31]
def build_header(self, section_lengths, out_stream): def build_header(self, section_lengths, out_stream):
''' '''

View File

@ -94,7 +94,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I) raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I) raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
raw = xml_replace_entities(raw) raw = xml_replace_entities(raw)
raw = re.sub('[\u00a0\u2029]', ' ', raw) raw = re.sub(r'[\u00a0\u2029]', ' ', raw)
i.write(raw.encode('utf-8')) i.write(raw.encode('utf-8'))

View File

@ -196,7 +196,7 @@ class PMLMLizer:
# Turn all characters that cannot be represented by themself into their # Turn all characters that cannot be represented by themself into their
# PML code equivalent # PML code equivalent
text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text) text = re.sub(r'[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)
# Remove excess spaces at beginning and end of lines # Remove excess spaces at beginning and end of lines
text = re.sub(r'(?m)^[ ]+', '', text) text = re.sub(r'(?m)^[ ]+', '', text)
@ -209,14 +209,14 @@ class PMLMLizer:
text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text) text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)
# Remove excessive newlines. # Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing: if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text) text = re.sub(r'\n{2,}', '\n', text)
# Only indent lines that don't have special formatting # Only indent lines that don't have special formatting
text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text) if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
else: else:
text = re.sub('\n{3,}', '\n\n', text) text = re.sub(r'\n{3,}', '\n\n', text)
return text return text

View File

@ -61,8 +61,8 @@ def to_int(x):
def clean(text): def clean(text):
text = re.sub('\\s*\n\\s*', '\n', text) text = re.sub(r'\s*\n\s*', '\n', text)
text = re.sub('[ \t]{2,}', ' ', text) text = re.sub(r'[ \t]{2,}', ' ', text)
return text.strip() return text.strip()

View File

@ -199,7 +199,7 @@ class RTFMLizer:
# Remove excessive spaces # Remove excessive spaces
text = re.sub(r'[ ]{2,}', ' ', text) text = re.sub(r'[ ]{2,}', ' ', text)
text = re.sub('\t{2,}', '\t', text) text = re.sub(r'\t{2,}', '\t', text)
text = text.replace('\t ', '\t') text = text.replace('\t ', '\t')
# Remove excessive line breaks # Remove excessive line breaks

View File

@ -719,7 +719,7 @@ class ProcessTokens:
def divide_num(self, numerator, denominator): def divide_num(self, numerator, denominator):
try: try:
# calibre why ignore negative number? Wrong in case of \fi # calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\\-]+', numerator).group()) numerator = float(re.search(r'[0-9.\-]+', numerator).group())
except TypeError: except TypeError:
if self.__run_level > 3: if self.__run_level > 3:
msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \ msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \

View File

@ -162,12 +162,12 @@ class SNBMLizer:
# text = re.sub('[ ]{2,}', ' ', text) # text = re.sub('[ ]{2,}', ' ', text)
# Remove excessive newlines. # Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing: if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text) text = re.sub(r'\n{2,}', '\n', text)
text = re.sub(r'(?imu)^(?=.)', '\t', text) text = re.sub(r'(?imu)^(?=.)', '\t', text)
else: else:
text = re.sub('\n{3,}', '\n\n', text) text = re.sub(r'\n{3,}', '\n\n', text)
# Replace spaces at the beginning and end of lines # Replace spaces at the beginning and end of lines
text = re.sub(r'(?imu)^[ ]+', '', text) text = re.sub(r'(?imu)^[ ]+', '', text)

View File

@ -58,7 +58,7 @@ class MarkdownMLizer(OEB2HTML):
# Remove tabs that aren't at the beginning of a line # Remove tabs that aren't at the beginning of a line
new_text = [] new_text = []
for l in text.splitlines(): for l in text.splitlines():
start = re.match('\t+', l) start = re.match(r'\t+', l)
if start: if start:
start = start.group() start = start.group()
else: else:
@ -71,7 +71,7 @@ class MarkdownMLizer(OEB2HTML):
text = re.sub(r'(?msu)^[ ]+$', '', text) text = re.sub(r'(?msu)^[ ]+$', '', text)
# Reduce blank lines # Reduce blank lines
text = re.sub('(?msu)\n{7,}', '\n' * 6, text) text = re.sub(r'(?msu)\n{7,}', '\n' * 6, text)
# Remove blank lines at beginning and end of document. # Remove blank lines at beginning and end of document.
text = re.sub(r'^\s*', '', text) text = re.sub(r'^\s*', '', text)

View File

@ -31,7 +31,7 @@ def clean_txt(txt):
txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the line with &nbsp; # Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt) txt = re.sub(r'(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
# Condense redundant spaces # Condense redundant spaces
txt = re.sub(r'[ ]{2,}', ' ', txt) txt = re.sub(r'[ ]{2,}', ' ', txt)
@ -40,7 +40,7 @@ def clean_txt(txt):
txt = re.sub(r'^\s+(?=.)', '', txt) txt = re.sub(r'^\s+(?=.)', '', txt)
txt = re.sub(r'(?<=.)\s+$', '', txt) txt = re.sub(r'(?<=.)\s+$', '', txt)
# Remove excessive line breaks. # Remove excessive line breaks.
txt = re.sub('\n{5,}', '\n\n\n\n', txt) txt = re.sub(r'\n{5,}', '\n\n\n\n', txt)
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24 # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
txt = clean_ascii_chars(txt) txt = clean_ascii_chars(txt)
@ -190,7 +190,7 @@ def separate_paragraphs_single_line(txt):
def separate_paragraphs_print_formatted(txt): def separate_paragraphs_print_formatted(txt):
txt = re.sub('(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) txt = re.sub(r'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt return txt

View File

@ -109,16 +109,16 @@ class TextileMLizer(OEB2HTML):
# reduce blank lines # reduce blank lines
text = re.sub(r'\n{3}', r'\n\np. \n\n', text) text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
text = re.sub('%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text) text = re.sub(r'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
# Check span following blank para # Check span following blank para
text = re.sub(r'\n+ +%', r' %', text) text = re.sub(r'\n+ +%', r' %', text)
text = re.sub('p[<>=]{1,2}\\.\n\n?', r'', text) text = re.sub(r'p[<>=]{1,2}\.\n\n?', '', text)
# blank paragraph # blank paragraph
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
# blank paragraph # blank paragraph
text = text.replace('\n\xa0', '\np. ') text = text.replace('\n\xa0', '\np. ')
# blank paragraph # blank paragraph
text = re.sub('\np[<>=]{1,2}?\\. \xa0', r'\np. ', text) text = re.sub(r'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
# sort out spaces in tables # sort out spaces in tables

View File

@ -123,19 +123,19 @@ class TXTMLizer:
text = text.replace('\f+', ' ') text = text.replace('\f+', ' ')
# Single line paragraph. # Single line paragraph.
text = re.sub('(?<=.)\n(?=.)', ' ', text) text = re.sub(r'(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces. # Remove multiple spaces.
text = re.sub(r'[ ]{2,}', ' ', text) text = re.sub(r'[ ]{2,}', ' ', text)
# Remove excessive newlines. # Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing: if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text) text = re.sub(r'\n{2,}', '\n', text)
text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text) text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text)
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text) text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text)
else: else:
text = re.sub('\n{7,}', '\n\n\n\n\n\n', text) text = re.sub(r'\n{7,}', '\n\n\n\n\n\n', text)
# Replace spaces at the beginning and end of lines # Replace spaces at the beginning and end of lines
# We don't replace tabs because those are only added # We don't replace tabs because those are only added

View File

@ -87,4 +87,4 @@ class Jadecoder(Unidecoder):
text = self.conv.do(text) text = self.conv.do(text)
except Exception: except Exception:
pass pass
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text) return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

View File

@ -73,7 +73,7 @@ class Unidecoder:
def decode(self, text): def decode(self, text):
# Replace characters larger than 127 with their ASCII equivalent. # Replace characters larger than 127 with their ASCII equivalent.
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text) return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
def replace_point(self, codepoint): def replace_point(self, codepoint):
''' '''

View File

@ -733,7 +733,7 @@ class ResultDetails(QWidget):
def render_results(self, results, individual_match=None): def render_results(self, results, individual_match=None):
html = [] html = []
space_pat = re.compile(r'\s+') space_pat = re.compile(r'\s+')
markup_pat = re.compile('\x1d') markup_pat = re.compile(r'\x1d')
def markup_text(text): def markup_text(text):
count = 0 count = 0

View File

@ -433,7 +433,7 @@ def run_gui_(opts, args, app, gui_debug=None):
winutil.prepare_for_restart() winutil.prepare_for_restart()
with open(debugfile, 'r+b') as f: with open(debugfile, 'r+b') as f:
raw = f.read() raw = f.read()
raw = re.sub(b'(?<!\r)\n', b'\r\n', raw) raw = re.sub(br'(?<!\r)\n', br'\r\n', raw)
f.seek(0) f.seek(0)
f.truncate() f.truncate()
f.write(raw) f.write(raw)

View File

@ -39,7 +39,7 @@ attribute_name_pat = re.compile(r'''[^%s"'/><=]+''' % space_chars)
self_closing_pat = re.compile(r'/\s*>') self_closing_pat = re.compile(r'/\s*>')
unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars) unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags} cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens nbsp_pat = re.compile(r'[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
NORMAL = 0 NORMAL = 0
IN_OPENING_TAG = 1 IN_OPENING_TAG = 1

View File

@ -119,7 +119,7 @@ class NumberToText: # {{{
self.text = NumberToText(self.number.replace('%',' percent')).text self.text = NumberToText(self.number.replace('%',' percent')).text
# Test for decimal # Test for decimal
elif re.search('\\.',self.number): elif '.' in self.number:
if self.verbose: if self.verbose:
self.log('Decimal: %s' % self.number) self.log('Decimal: %s' % self.number)
self.number_as_float = self.number self.number_as_float = self.number
@ -150,7 +150,7 @@ class NumberToText: # {{{
self.text = NumberToText(self.number_as_float).text self.text = NumberToText(self.number_as_float).text
# Test for hybrid e.g., 'K2, 2nd, 10@10' # Test for hybrid e.g., 'K2, 2nd, 10@10'
elif re.search('[\\D]+', self.number): elif re.search(r'[\D]+', self.number):
if self.verbose: if self.verbose:
self.log('Hybrid: %s' % self.number) self.log('Hybrid: %s' % self.number)
# Split the token into number/text # Split the token into number/text

View File

@ -11,7 +11,7 @@ from calibre.utils.html2text import html2text
# Hackish - ignoring sentences ending or beginning in numbers to avoid # Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points. # confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])') lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe', sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
re.IGNORECASE) re.IGNORECASE)

View File

@ -674,7 +674,7 @@ class CustomColumns:
editable=True, display={}): editable=True, display={}):
if not label: if not label:
raise ValueError(_('No label was provided')) raise ValueError(_('No label was provided'))
if re.match('^\\w*$', label) is None or not label[0].isalpha() or label.lower() != label: if re.match(r'^\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter')) raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter'))
if datatype not in self.CUSTOM_DATA_TYPES: if datatype not in self.CUSTOM_DATA_TYPES:
raise ValueError('%r is not a supported data type'%datatype) raise ValueError('%r is not a supported data type'%datatype)

View File

@ -193,7 +193,7 @@ def load_dictionary(dictionary):
class Dictionaries: class Dictionaries:
def __init__(self): def __init__(self):
self.remove_hyphenation = re.compile('[\u2010-]+') self.remove_hyphenation = re.compile(r'[\u2010-]+')
self.negative_pat = re.compile(r'-[.\d+]') self.negative_pat = re.compile(r'-[.\d+]')
self.fix_punctuation_pat = re.compile(r'''[:.]''') self.fix_punctuation_pat = re.compile(r'''[:.]''')
self.dictionaries = {} self.dictionaries = {}

View File

@ -2553,7 +2553,7 @@ class BibTeX:
self.ascii_bibtex = True self.ascii_bibtex = True
# This substitution is based on the description of cite key restrictions at # This substitution is based on the description of cite key restrictions at
# http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html
self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]') self.invalid_cit = re.compile(r'[ "@\',\#}{~%&$^]')
self.upper = re.compile('[' + self.upper = re.compile('[' +
string.ascii_uppercase + ']') string.ascii_uppercase + ']')
self.escape = re.compile(r'[#&%_]') self.escape = re.compile(r'[#&%_]')

View File

@ -94,4 +94,4 @@ def unescape(text, rm=False, rchar=''):
if rm: if rm:
return rchar # replace by char return rchar # replace by char
return text # leave as is return text # leave as is
return re.sub('&#?\\w+;', fixup, text) return re.sub(r'&#?\w+;', fixup, text)

View File

@ -545,7 +545,7 @@ class RecursiveFetcher:
dsrc = self.fetch_url(iurl) dsrc = self.fetch_url(iurl)
newbaseurl = dsrc.newurl newbaseurl = dsrc.newurl
if len(dsrc) == 0 or \ if len(dsrc) == 0 or \
len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0: len(re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
raise ValueError('No content at URL %r'%iurl) raise ValueError('No content at URL %r'%iurl)
if callable(self.encoding): if callable(self.encoding):
dsrc = self.encoding(dsrc) dsrc = self.encoding(dsrc)

View File

@ -60,7 +60,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels):
displayLevels = 0 displayLevels = 0
listStyle = ListStyle(name=styleName) listStyle = ListStyle(name=styleName)
numFormatPattern = re.compile(r'([1IiAa])') numFormatPattern = re.compile(r'([1IiAa])')
cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?') cssLengthPattern = re.compile(r'([^a-z]+)\s*([a-z]+)?')
m = cssLengthPattern.search(spacing) m = cssLengthPattern.search(spacing)
if (m is not None): if (m is not None):
cssLengthNum = float(m.group(1)) cssLengthNum = float(m.group(1))