Always use raw strings for regexes (manual)

ruff rule 'RUF039' (unraw-re-pattern)
un-pogaz 2025-01-24 11:14:20 +01:00
parent ac6912565a
commit 3720de10d2
70 changed files with 136 additions and 142 deletions
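
For context: RUF039 flags non-raw string literals used as regular-expression patterns. In a plain literal every regex backslash must be doubled, and a single backslash such as '\.' only works because Python passes unknown string escapes through (with a DeprecationWarning, a SyntaxWarning on 3.12+); a raw string hands backslashes to the regex engine untouched. A minimal before/after sketch, taken from the LeMondeAbonne hunk below:

import re

# Before: regex backslashes have to be doubled to survive string parsing.
article_id_pattern = re.compile('[0-9]+\\.html')

# After: the raw literal forwards the backslash to re verbatim; the
# compiled pattern is identical, the source is just easier to read.
article_id_pattern = re.compile(r'[0-9]+\.html')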

View File

@ -16,5 +16,5 @@ class Android_com_pl(BasicNewsRecipe):
remove_tags_after = [{'class': 'post-content'}]
remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})]
preprocess_regexps = [
(re.compile(u'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
(re.compile(r'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
feeds = [(u'Android', u'http://android.com.pl/feed/')]

View File

@ -17,8 +17,8 @@ class BenchmarkPl(BasicNewsRecipe):
extra_css = 'ul {list-style-type: none;}'
no_stylesheets = True
use_embedded_content = False
preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501
re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')]
preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501, RUF039
re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: RUF039
keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict(
name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')]

View File

@ -41,7 +41,7 @@ class Blic(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(name='div', attrs={'id': 'article_info'})
remove_tags = [
dict(name=['object', 'link', 'meta', 'base', 'object', 'embed'])]
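
A note on the # noqa: RUF039 markers that appear here and throughout: instead of converting them, the commit suppresses the rule for literals that typically contain non-ASCII text or \u escapes, such as u'\u0110'. For the escape case the caution makes sense, since adding an r prefix changes the value of the literal itself. A sketch of the difference:

# '\u0110' is the one-character string 'Đ'; r'\u0110' is six characters
# (backslash, 'u', '0', '1', '1', '0') that the regex engine must
# interpret itself.
len('\u0110')    # 1
len(r'\u0110')   # 6
# Python's re does resolve \uhhhh escapes in str (not bytes) patterns,
# so the conversion would usually be behaviour-preserving -- the noqa
# simply keeps the literal byte-for-byte unchanged.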

View File

@ -27,7 +27,7 @@ class CveceZla(BasicNewsRecipe):
'comment': description, 'tags': 'igre, muzika, film, blog, Srbija', 'publisher': 'Mehmet Krljic', 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(attrs={'class': 'navigation'})
remove_tags_after = dict(attrs={'class': 'commentlist'})

View File

@ -61,7 +61,7 @@ class MediaDaumRecipe(BasicNewsRecipe):
lambda match: '<em>'),
(re.compile(r'<i>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
lambda match: '<i>'),
(re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501
(re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501, RUF039
lambda match: '</div>'),
]

View File

@ -42,7 +42,7 @@ class DnevnikCro(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

View File

@ -20,8 +20,8 @@ class DziennikWschodni(BasicNewsRecipe):
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501
(re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501
preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501, RUF039
(re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501, RUF039
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',

View File

@ -45,9 +45,7 @@ class Esensja(BasicNewsRecipe):
'''
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile(
u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
]
(re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]
def parse_index(self):
soup = self.index_to_soup('http://www.esensja.pl/magazyn/')

View File

@ -23,9 +23,7 @@ class EsensjaRSS(BasicNewsRecipe):
remove_javascript = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile(
u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
]
(re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]
remove_attributes = ['style', 'bgcolor', 'alt', 'color']
keep_only_tags = [dict(attrs={'class': 'sekcja'}), ]
remove_tags_after = dict(id='tekst')

View File

@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe):
'ul.sep-line > li + li::before {content: " | "} '
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
(re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''),
(re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), # noqa: RUF039
(re.compile(type(u'')(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />')
]
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',

View File

@ -34,7 +34,7 @@ class gw_krakow(BasicNewsRecipe):
# rules for gazeta.pl
preprocess_regexps = [
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
keep_only_tags = [dict(id='gazeta_article')]
remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

View File

@ -33,7 +33,7 @@ class gw_wawa(BasicNewsRecipe):
# rules for gazeta.pl
preprocess_regexps = [
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
keep_only_tags = [dict(id='gazeta_article')]
remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

View File

@ -54,11 +54,11 @@ def solve_captcha(captcha):
# Parse into parts
pattern = re.compile(
u'(?P<first_component>[0-9]+)?'
u'\\s*(?P<operator>[+×])\\s*'
u'(?P<second_component>[0-9]+)'
u'\\s*(=)\\s*'
u'(?P<result>[0-9]+)?', re.UNICODE)
r'(?P<first_component>[0-9]+)?'
u'\\s*(?P<operator>[+×])\\s*' # noqa: RUF039
r'(?P<second_component>[0-9]+)'
r'\s*(=)\s*'
r'(?P<result>[0-9]+)?', re.UNICODE)
calculationParts = re.search(pattern, numeric_problem)
if calculationParts is None:
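
This captcha pattern also shows RUF039 interacting with implicit string concatenation: adjacent literals form a single pattern, each piece converts independently, and only the piece containing the non-ASCII '×' kept its u prefix plus a noqa. A small sketch with hypothetical captcha text:

import re

# Raw and non-raw pieces can be mixed freely; '\\s*' in a non-raw piece
# and r'\s*' in a raw piece spell the same regex token.
pattern = re.compile(
    r'(?P<first_component>[0-9]+)?'
    '\\s*(?P<operator>[+×])\\s*'
    r'(?P<second_component>[0-9]+)')
m = pattern.search('3 + 4 =')    # hypothetical captcha text
m.group('operator')              # '+'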

View File

@ -16,7 +16,7 @@ class in4(BasicNewsRecipe):
no_stylesheets = True
remove_empty_feeds = True
preprocess_regexps = [
(re.compile(u'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
(re.compile(r'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})]
remove_tags_after = dict(name='img', attrs={'title': 'komentarze'})
remove_tags = [dict(name='img', attrs={'title': 'komentarze'})]

View File

@ -29,9 +29,9 @@ class KopalniaWiedzy(BasicNewsRecipe):
extra_css = '.topimage {margin-top: 30px}'
preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
(re.compile(r'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>'),
(re.compile(u'<br /><br />'),
(re.compile(r'<br /><br />'),
lambda match: '<br/>')
]

View File

@ -66,7 +66,7 @@ class LeMondeAbonne(BasicNewsRecipe):
dict(name='div', attrs={'class': 'po-copy'})
]
article_id_pattern = re.compile('[0-9]+\\.html')
article_id_pattern = re.compile(r'[0-9]+\.html')
article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
def get_browser(self):

View File

@ -43,7 +43,7 @@ class NacionalCro(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [dict(name=['object', 'link', 'embed'])]

View File

@ -11,8 +11,8 @@ class NaTemat(BasicNewsRecipe):
description = u'informacje, komentarze, opinie'
category = 'news'
language = 'pl'
preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501
(re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501
preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501, RUF039
(re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501, RUF039
cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
no_stylesheets = True
keep_only_tags = [dict(id='main')]

View File

@ -34,7 +34,7 @@ class NjuzNet(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [
dict(attrs={'id': 'entryMeta'}), dict(attrs={'class': 'post'})

View File

@ -36,7 +36,7 @@ class NoviList_hr(BasicNewsRecipe):
p{display: block}
'''
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True

View File

@ -35,7 +35,7 @@ class Novosti(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
keep_only_tags = [dict(attrs={'class': [
'articleTitle', 'articleInfo', 'articleLead', 'singlePhoto fl', 'articleBody']})]

View File

@ -45,7 +45,7 @@ class Nspm(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])]
remove_attributes = ['width', 'height', 'lang', 'xmlns:fb',
'xmlns:og', 'vspace', 'hspace', 'type', 'start', 'size']

View File

@ -31,8 +31,7 @@ class ObservatorulCultural(BasicNewsRecipe):
def parse_index(self):
soup = self.index_to_soup(
'http://www.observatorcultural.ro/Arhiva*-archive.html')
issueTag = soup.find('a', href=re.compile(
'observatorcultural.ro\\/Numarul'))
issueTag = soup.find('a', href=re.compile(r'observatorcultural.ro/Numarul'))
issueURL = issueTag['href']
print(issueURL)
issueSoup = self.index_to_soup(issueURL)

View File

@ -34,7 +34,7 @@ class Pescanik(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [
dict(name=['object', 'link', 'meta', 'script', 'iframe', 'embed'])]
keep_only_tags = [

View File

@ -31,9 +31,9 @@ class AdvancedUserRecipe1289939440(BasicNewsRecipe):
]
preprocess_regexps = [
(re.compile(u'<p class="perex[^"]*">[^<]*<img[^>]*>',
(re.compile(r'<p class="perex[^"]*">[^<]*<img[^>]*>',
re.DOTALL), lambda match: '<p class="intro">'),
(re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>',
(re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>', # noqa: RUF039
re.DOTALL), lambda match: '<!--deleted-->')
]

View File

@ -38,7 +38,7 @@ class RTS(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
feeds = [

View File

@ -25,4 +25,4 @@ class swiatczytnikow(BasicNewsRecipe):
dict(name='div', attrs={'class': 'feedflare'})]
preprocess_regexps = [
(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')]
(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')] # noqa: RUF039

View File

@ -15,8 +15,8 @@ class Tablety_pl(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''),
(re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), # noqa: RUF039
(re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] # noqa: RUF039
keep_only_tags = [dict(attrs={'class': ['featured-image', 'article-content clearfix']})]
remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments',
'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]

View File

@ -27,7 +27,7 @@ class TheCultOfGhoul(BasicNewsRecipe):
'comment': description, 'tags': 'film, blog, srbija, strava, uzas', 'publisher': 'Dejan Ognjanovic', 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')]

View File

@ -43,7 +43,7 @@ class VecernjiList(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags = [
dict(name=['object', 'link', 'embed']), dict(

View File

@ -41,7 +41,7 @@ class Vreme(BasicNewsRecipe):
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
remove_tags_before = dict(attrs={'class': 'toc-heading'})
remove_tags_after = dict(attrs={'class': 'footer'})

View File

@ -10,8 +10,8 @@ class WNP(BasicNewsRecipe):
description = u'Wirtualny Nowy Przemysł'
category = 'economy'
language = 'pl'
preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''),
(re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')]
preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), # noqa: RUF039
(re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] # noqa: RUF039
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets = True

View File

@ -48,7 +48,7 @@ class ZeitEPUBAbo(BasicNewsRecipe):
preprocess_regexps = [
# filtering for correct dashes ("Gedankenstrich" and "bis")
(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), # noqa: RUF039
(re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
(re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
# fix the number dash number dash for the title image that was broken
@ -130,9 +130,9 @@ class ZeitEPUBAbo(BasicNewsRecipe):
(re.compile(
r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''),
# before closing quotation
(re.compile(u' \u00AB'), lambda match: u'\u00AB '),
(re.compile(u' \u00AB'), lambda match: u'\u00AB '), # noqa: RUF039
# after opening quotation
(re.compile(u'\u00BB '), lambda match: u' \u00BB'),
(re.compile(u'\u00BB '), lambda match: u' \u00BB'), # noqa: RUF039
# filtering for spaces in large numbers for better readability
# end of the number with some character following
(re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'),
@ -151,25 +151,25 @@ class ZeitEPUBAbo(BasicNewsRecipe):
# filtering for unicode characters that are missing on the Kindle,
# try to replace them with meaningful work-arounds
# subscript-0
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'),
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # noqa: RUF039
# subscript-1
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'),
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # noqa: RUF039
# subscript-2
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'),
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # noqa: RUF039
# subscript-3
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'),
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # noqa: RUF039
# subscript-4
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'),
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # noqa: RUF039
# subscript-5
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'),
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # noqa: RUF039
# subscript-6
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'),
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # noqa: RUF039
# subscript-7
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'),
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # noqa: RUF039
# subscript-8
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'),
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # noqa: RUF039
# subscript-9
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'),
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # noqa: RUF039
# always chance CO2
(re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
# remove *** paragraphs

View File

@ -21,7 +21,7 @@ class PagebreakPageGenerator(IPageGenerator):
''' Determine pages based on the presence of <*pagebreak*/>. '''
html = mobi_html(mobi_file_path)
pages = []
for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html):
for m in re.finditer(br'<[^>]*pagebreak[^>]*>', html):
pages.append(m.end())
return Pages(pages)

View File

@ -32,7 +32,7 @@ class TCRCompressor:
The intent is to create more unused codes.
'''
possible_codes = []
a_code = set(re.findall(b'(?ms).', self.coded_txt))
a_code = set(re.findall(br'(?ms).', self.coded_txt))
for code in a_code:
single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
@ -57,7 +57,7 @@ class TCRCompressor:
'''
Create new codes from codes that occur in pairs often.
'''
possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
possible_new_codes = list(set(re.findall(br'(?ms)..', self.coded_txt)))
new_codes_count = []
for c in possible_new_codes:
@ -74,7 +74,7 @@ class TCRCompressor:
def compress(self, txt):
self._reset()
self.codes = list(set(re.findall(b'(?ms).', txt)))
self.codes = list(set(re.findall(br'(?ms).', txt)))
# Replace the text with their corresponding code
# FIXME: python3 is native bytearray, but all we want are bytes
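
The same treatment is applied to bytes patterns (br'...'). For literals with no backslash, like b'(?ms).', the raw prefix changes nothing at runtime and is purely for consistency; where a backslash is present, as in the RTF input hunk below, the regex engine resolves standard escapes such as \n itself, so behaviour is preserved. For instance:

import re

b'(?ms).' == br'(?ms).'                # True: no backslash, same bytes
# b'\n+' contains a real newline byte; br'\n+' contains backslash + 'n',
# which the regex engine resolves back to a newline, so both match runs
# of newlines:
re.sub(br'\n+', b'\n', b'a\n\n\nb')    # b'a\nb'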

View File

@ -46,7 +46,7 @@ class EPUBInput(InputFormatPlugin):
from lxml import etree
idpf_key = opf.raw_unique_identifier
if idpf_key:
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
key = None
for item in opf.identifier_iter():

View File

@ -503,7 +503,7 @@ class EPUBOutput(OutputFormatPlugin):
tag.tag = XHTML('div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
special_chars = re.compile('[\u200b\u00ad]')
special_chars = re.compile(r'[\u200b\u00ad]')
for elem in root.iterdescendants('*'):
if elem.text:
elem.text = special_chars.sub('', elem.text)

View File

@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
res = as_bytes(transform.tostring(result))
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n
res = re.sub(b'\n+', b'\n', res)
res = re.sub(br'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}',

View File

@ -94,7 +94,7 @@ class DocAnalysis:
elif format == 'spanned_html':
linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n')
linere = re.compile(r'.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):

View File

@ -57,8 +57,8 @@ class HeuristicProcessor:
' chapters. - ' + str(chap))
return '<h2>'+chap+'</h2>\n'
else:
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
delete_quotes = re.compile('\'"')
delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
delete_quotes = re.compile(r'\'"')
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1
@ -109,7 +109,7 @@ class HeuristicProcessor:
be marked up to return true.
'''
htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
line_end_ere = re.compile(r'(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
@ -417,7 +417,7 @@ class HeuristicProcessor:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
add_markup = re.compile(r'(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
return html
@ -440,7 +440,7 @@ class HeuristicProcessor:
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html)
# Delete microsoft 'smart' tags
html = re.sub('(?i)</?st1:\\w+>', '', html)
html = re.sub(r'(?i)</?st1:\w+>', '', html)
# Re-open self closing paragraph tags
html = re.sub(r'<p[^>/]*/>', '<p> </p>', html)
# Get rid of empty span, bold, font, em, & italics tags
@ -451,7 +451,7 @@ class HeuristicProcessor:
html = re.sub(
r'\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}'.format(open=open_fmt_pat, close=close_fmt_pat) , ' ', html)
# delete surrounding divs from empty paragraphs
html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
html = re.sub(r'<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
# Empty heading tags
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
self.deleted_nbsps = True
@ -538,7 +538,7 @@ class HeuristicProcessor:
elif content.find('scenebreak') != -1:
return content
else:
content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
content = re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', r'\n\n<h\g<hnum> style="'+top_margin+bottom_margin+'">', content)
return content
html = blanks_around_headings.sub(merge_header_whitespace, html)
@ -551,7 +551,7 @@ class HeuristicProcessor:
html = blanks_n_nopunct.sub(markup_whitespaces, html)
if self.html_preprocess_sections > self.min_chapters:
html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
html = re.sub(r'(?si)^.*?(?=<h\d)', markup_whitespaces, html)
return html
@ -600,13 +600,13 @@ class HeuristicProcessor:
if re.match(r'^<hr', replacement_break):
if replacement_break.find('width') != -1:
try:
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
width = int(re.sub(r'.*?width(:|=)(?P<wnum>\d+).*', r'\g<wnum>', replacement_break))
except:
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
self.log.warn('Invalid replacement scene break'
' expression, using default')
else:
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
replacement_break = re.sub(r'(?i)(width=\d+\\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
divpercent = (100 - width) // 2
hr_open = re.sub(r'45', str(divpercent), hr_open)
scene_break = hr_open+replacement_break+'</div>'
@ -617,24 +617,24 @@ class HeuristicProcessor:
else:
from calibre.utils.html2text import html2text
replacement_break = html2text(replacement_break)
replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>'
else:
replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>'
return scene_break
def check_paragraph(self, content):
content = re.sub('\\s*</?span[^>]*>\\s*', '', content)
if re.match('.*["\'.!?:]$', content):
content = re.sub(r'\s*</?span[^>]*>\s*', '', content)
if re.match(r'.*["\'.!?:]$', content):
# print('detected this as a paragraph')
return True
else:
return False
def abbyy_processor(self, html):
abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
abbyy_line = re.compile(r'((?P<linestart><p\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n'
self.in_blockquote = False
self.previous_was_paragraph = False
@ -680,7 +680,7 @@ class HeuristicProcessor:
if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';'
if style == 'text-indent':
setting = int(re.sub('\\s*pt\\s*', '', setting))
setting = int(re.sub(r'\s*pt\s*', '', setting))
if 9 < setting < 14:
text_indent = indented_text
else:
@ -853,7 +853,7 @@ class HeuristicProcessor:
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
html = re.sub(r'(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
@ -870,9 +870,9 @@ class HeuristicProcessor:
replacement_break = self.markup_user_break(replacement_break)
if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
else:
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
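
Several hunks in this file also raw-string the replacement argument of re.sub (e.g. r'\n\n<h\g<hnum> ...'). That is safe because re parses escapes in the replacement template itself: \n becomes a newline and \g<name> a group backreference regardless of whether the Python literal was raw. A sketch:

import re

# \n and \g<hnum> are resolved by re's template parser, so the raw
# replacement behaves exactly like the old "'\n\n<h' + ..." spelling.
re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', r'\n\n<h\g<hnum>>', '<H2 class="x">')
# -> '\n\n<h2>'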

View File

@ -62,7 +62,7 @@ class TextRun:
self.first_html_parent = first_html_parent
if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile('(\u00ad)')
TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile(r'(\u00ad)')
self.style = style
self.texts = []
self.link = None

View File

@ -42,21 +42,21 @@ def get_metadata(stream, extract_cover=True):
for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
m = re.search(br'TITLE="(.*?)"', comment)
if m:
mi.title = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
mi.title = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'AUTHOR="(.*?)"', comment)
if m:
if mi.authors == [_('Unknown')]:
mi.authors = []
mi.authors.append(re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))))
mi.authors.append(re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))))
m = re.search(br'PUBLISHER="(.*?)"', comment)
if m:
mi.publisher = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
mi.publisher = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'COPYRIGHT="(.*?)"', comment)
if m:
mi.rights = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
mi.rights = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
m = re.search(br'ISBN="(.*?)"', comment)
if m:
mi.isbn = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
mi.isbn = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
return mi

View File

@ -31,7 +31,7 @@ def get_metadata(stream, extract_cover=True):
mdata = mdata[:1024]
mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
mo = re.search(r'(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
if mo is not None:
mi.title = mo.group('title')
mi.authors = mo.group('author').split(',')

View File

@ -393,8 +393,7 @@ class MobiReader:
self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
'', html)
return re.sub(r'\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', '', html)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None:

View File

@ -1340,7 +1340,7 @@ class EpubContainer(Container):
break
if raw_unique_identifier is not None:
idpf_key = raw_unique_identifier
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
return package_id, raw_unique_identifier, idpf_key

View File

@ -124,8 +124,8 @@ class Reader132(FormatReader):
if self.header_record.footnote_count > 0:
html += '<br /><h1>%s</h1>' % _('Footnotes')
footnoteids = re.findall(
'\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
footnoteids = re.findall(r'\w+(?=\x00)',
self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
self.log.debug('Extracting footnote page %i' % i)
if fid < len(footnoteids):
@ -136,8 +136,8 @@ class Reader132(FormatReader):
if self.header_record.sidebar_count > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar')
sidebarids = re.findall(
'\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
sidebarids = re.findall(r'\w+(?=\x00)',
self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
self.log.debug('Extracting sidebar page %i' % i)
if sid < len(sidebarids):

View File

@ -32,7 +32,7 @@ class PdbHeaderReader:
def name(self):
self.stream.seek(0)
return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))
return re.sub(br'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))
def full_section_info(self, number):
if not (0 <= number < self.num_sections):
@ -70,7 +70,7 @@ class PdbHeaderBuilder:
self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
if isinstance(title, str):
title = title.encode('ascii', 'replace')
self.title = b'%s\x00' % re.sub(b'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31]
self.title = b'%s\x00' % re.sub(br'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31]
def build_header(self, section_lengths, out_stream):
'''

View File

@ -94,7 +94,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
raw = xml_replace_entities(raw)
raw = re.sub('[\u00a0\u2029]', ' ', raw)
raw = re.sub(r'[\u00a0\u2029]', ' ', raw)
i.write(raw.encode('utf-8'))

View File

@ -196,7 +196,7 @@ class PMLMLizer:
# Turn all characters that cannot be represented by themself into their
# PML code equivalent
text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)
text = re.sub(r'[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)
# Remove excess spaces at beginning and end of lines
text = re.sub(r'(?m)^[ ]+', '', text)
@ -209,14 +209,14 @@ class PMLMLizer:
text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)
text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
# Only indent lines that don't have special formatting
text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
else:
text = re.sub('\n{3,}', '\n\n', text)
text = re.sub(r'\n{3,}', '\n\n', text)
return text

View File

@ -61,8 +61,8 @@ def to_int(x):
def clean(text):
text = re.sub('\\s*\n\\s*', '\n', text)
text = re.sub('[ \t]{2,}', ' ', text)
text = re.sub(r'\s*\n\s*', '\n', text)
text = re.sub(r'[ \t]{2,}', ' ', text)
return text.strip()

View File

@ -199,7 +199,7 @@ class RTFMLizer:
# Remove excessive spaces
text = re.sub(r'[ ]{2,}', ' ', text)
text = re.sub('\t{2,}', '\t', text)
text = re.sub(r'\t{2,}', '\t', text)
text = text.replace('\t ', '\t')
# Remove excessive line breaks

View File

@ -719,7 +719,7 @@ class ProcessTokens:
def divide_num(self, numerator, denominator):
try:
# calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\\-]+', numerator).group())
numerator = float(re.search(r'[0-9.\-]+', numerator).group())
except TypeError:
if self.__run_level > 3:
msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \

View File

@ -162,12 +162,12 @@ class SNBMLizer:
# text = re.sub('[ ]{2,}', ' ', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)
text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
text = re.sub(r'(?imu)^(?=.)', '\t', text)
else:
text = re.sub('\n{3,}', '\n\n', text)
text = re.sub(r'\n{3,}', '\n\n', text)
# Replace spaces at the beginning and end of lines
text = re.sub(r'(?imu)^[ ]+', '', text)

View File

@ -58,7 +58,7 @@ class MarkdownMLizer(OEB2HTML):
# Remove tabs that aren't at the beginning of a line
new_text = []
for l in text.splitlines():
start = re.match('\t+', l)
start = re.match(r'\t+', l)
if start:
start = start.group()
else:
@ -71,7 +71,7 @@ class MarkdownMLizer(OEB2HTML):
text = re.sub(r'(?msu)^[ ]+$', '', text)
# Reduce blank lines
text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
text = re.sub(r'(?msu)\n{7,}', '\n' * 6, text)
# Remove blank lines at beginning and end of document.
text = re.sub(r'^\s*', '', text)

View File

@ -31,7 +31,7 @@ def clean_txt(txt):
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
txt = re.sub(r'(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
# Condense redundant spaces
txt = re.sub(r'[ ]{2,}', ' ', txt)
@ -40,7 +40,7 @@ def clean_txt(txt):
txt = re.sub(r'^\s+(?=.)', '', txt)
txt = re.sub(r'(?<=.)\s+$', '', txt)
# Remove excessive line breaks.
txt = re.sub('\n{5,}', '\n\n\n\n', txt)
txt = re.sub(r'\n{5,}', '\n\n\n\n', txt)
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24
txt = clean_ascii_chars(txt)
@ -190,7 +190,7 @@ def separate_paragraphs_single_line(txt):
def separate_paragraphs_print_formatted(txt):
txt = re.sub('(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
txt = re.sub(r'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt

View File

@ -109,16 +109,16 @@ class TextileMLizer(OEB2HTML):
# reduce blank lines
text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
text = re.sub('%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text)
text = re.sub(r'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
# Check span following blank para
text = re.sub(r'\n+ +%', r' %', text)
text = re.sub('p[<>=]{1,2}\\.\n\n?', r'', text)
text = re.sub(r'p[<>=]{1,2}\.\n\n?', '', text)
# blank paragraph
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
# blank paragraph
text = text.replace('\n\xa0', '\np. ')
# blank paragraph
text = re.sub('\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
text = re.sub(r'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
# sort out spaces in tables

View File

@ -123,19 +123,19 @@ class TXTMLizer:
text = text.replace('\f+', ' ')
# Single line paragraph.
text = re.sub('(?<=.)\n(?=.)', ' ', text)
text = re.sub(r'(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces.
text = re.sub(r'[ ]{2,}', ' ', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)
text = re.sub(r'\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text)
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text)
else:
text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
text = re.sub(r'\n{7,}', '\n\n\n\n\n\n', text)
# Replace spaces at the beginning and end of lines
# We don't replace tabs because those are only added

View File

@ -87,4 +87,4 @@ class Jadecoder(Unidecoder):
text = self.conv.do(text)
except Exception:
pass
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

View File

@ -73,7 +73,7 @@ class Unidecoder:
def decode(self, text):
# Replace characters larger than 127 with their ASCII equivalent.
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
def replace_point(self, codepoint):
'''

View File

@ -733,7 +733,7 @@ class ResultDetails(QWidget):
def render_results(self, results, individual_match=None):
html = []
space_pat = re.compile(r'\s+')
markup_pat = re.compile('\x1d')
markup_pat = re.compile(r'\x1d')
def markup_text(text):
count = 0

View File

@ -433,7 +433,7 @@ def run_gui_(opts, args, app, gui_debug=None):
winutil.prepare_for_restart()
with open(debugfile, 'r+b') as f:
raw = f.read()
raw = re.sub(b'(?<!\r)\n', b'\r\n', raw)
raw = re.sub(br'(?<!\r)\n', br'\r\n', raw)
f.seek(0)
f.truncate()
f.write(raw)

View File

@ -39,7 +39,7 @@ attribute_name_pat = re.compile(r'''[^%s"'/><=]+''' % space_chars)
self_closing_pat = re.compile(r'/\s*>')
unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
nbsp_pat = re.compile(r'[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
NORMAL = 0
IN_OPENING_TAG = 1
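
The character-class conversions here and in the EPUB plugins above (e.g. r'[\u0020\u0009\u000d\u000a]') rely on re resolving \xhh, \uhhhh and \Uhhhhhhhh escapes inside str patterns; bytes patterns accept \x but reject \u, which is why this shortcut only applies to text patterns. For example:

import re

re.match('\u00a0', '\xa0') is not None     # True: escape resolved by Python
re.match(r'\u00a0', '\xa0') is not None    # True: escape resolved by re
# re.compile(rb'\u00a0') would raise re.error: bad escape \u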

View File

@ -119,7 +119,7 @@ class NumberToText: # {{{
self.text = NumberToText(self.number.replace('%',' percent')).text
# Test for decimal
elif re.search('\\.',self.number):
elif '.' in self.number:
if self.verbose:
self.log('Decimal: %s' % self.number)
self.number_as_float = self.number
@ -150,7 +150,7 @@ class NumberToText: # {{{
self.text = NumberToText(self.number_as_float).text
# Test for hybrid e.g., 'K2, 2nd, 10@10'
elif re.search('[\\D]+', self.number):
elif re.search(r'[\D]+', self.number):
if self.verbose:
self.log('Hybrid: %s' % self.number)
# Split the token into number/text

View File

@ -11,7 +11,7 @@ from calibre.utils.html2text import html2text
# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
re.IGNORECASE)

View File

@ -674,7 +674,7 @@ class CustomColumns:
editable=True, display={}):
if not label:
raise ValueError(_('No label was provided'))
if re.match('^\\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
if re.match(r'^\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter'))
if datatype not in self.CUSTOM_DATA_TYPES:
raise ValueError('%r is not a supported data type'%datatype)

View File

@ -193,7 +193,7 @@ def load_dictionary(dictionary):
class Dictionaries:
def __init__(self):
self.remove_hyphenation = re.compile('[\u2010-]+')
self.remove_hyphenation = re.compile(r'[\u2010-]+')
self.negative_pat = re.compile(r'-[.\d+]')
self.fix_punctuation_pat = re.compile(r'''[:.]''')
self.dictionaries = {}

View File

@ -2553,7 +2553,7 @@ class BibTeX:
self.ascii_bibtex = True
# This substitution is based on the description of cite key restrictions at
# http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html
self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]')
self.invalid_cit = re.compile(r'[ "@\',\#}{~%&$^]')
self.upper = re.compile('[' +
string.ascii_uppercase + ']')
self.escape = re.compile(r'[#&%_]')

View File

@ -94,4 +94,4 @@ def unescape(text, rm=False, rchar=''):
if rm:
return rchar # replace by char
return text # leave as is
return re.sub('&#?\\w+;', fixup, text)
return re.sub(r'&#?\w+;', fixup, text)

View File

@ -545,7 +545,7 @@ class RecursiveFetcher:
dsrc = self.fetch_url(iurl)
newbaseurl = dsrc.newurl
if len(dsrc) == 0 or \
len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
len(re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
raise ValueError('No content at URL %r'%iurl)
if callable(self.encoding):
dsrc = self.encoding(dsrc)

View File

@ -60,7 +60,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels):
displayLevels = 0
listStyle = ListStyle(name=styleName)
numFormatPattern = re.compile(r'([1IiAa])')
cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?')
cssLengthPattern = re.compile(r'([^a-z]+)\s*([a-z]+)?')
m = cssLengthPattern.search(spacing)
if (m is not None):
cssLengthNum = float(m.group(1))