mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00

commit 3720de10d2 (parent ac6912565a)

always use raw-string for regex (manual)
ruff 'RUF039'
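The rule being applied: ruff's RUF039 wants every pattern handed to the re module written as a raw string, so regex escapes are not spelled with doubled backslashes. A minimal sketch of the before/after shape (the pattern here is illustrative, not from this commit):

    import re

    # Before: RUF039 flags the non-raw literal; '\\s' only reaches
    # the regex engine as \s because the backslash is doubled.
    old_pat = re.compile('<p>\\s*</p>', re.DOTALL)

    # After: the raw literal passes \s through untouched.
    new_pat = re.compile(r'<p>\s*</p>', re.DOTALL)

    # Both spell the identical pattern, so behaviour cannot change.
    assert old_pat.pattern == new_pat.pattern

Where a literal was left non-raw (mostly ones carrying non-ASCII text or string-level escapes), the commit silences the lint with # noqa: RUF039 instead of converting, as the hunks below show.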
@@ -16,5 +16,5 @@ class Android_com_pl(BasicNewsRecipe):
 remove_tags_after = [{'class': 'post-content'}]
 remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})]
 preprocess_regexps = [
-(re.compile(u'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
+(re.compile(r'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
 feeds = [(u'Android', u'http://android.com.pl/feed/')]

@@ -17,8 +17,8 @@ class BenchmarkPl(BasicNewsRecipe):
 extra_css = 'ul {list-style-type: none;}'
 no_stylesheets = True
 use_embedded_content = False
-preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501
-re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')]
+preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa: E501, RUF039
+re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: RUF039

 keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict(
 name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')]

@@ -41,7 +41,7 @@ class Blic(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
 remove_tags_before = dict(name='div', attrs={'id': 'article_info'})
 remove_tags = [
 dict(name=['object', 'link', 'meta', 'base', 'object', 'embed'])]
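The u'\u0110' pattern above recurs across the Serbian and Croatian recipes (it maps Đ to Ð). These stay non-raw and get # noqa: RUF039, presumably because \u0110 is resolved by the string literal itself; in a raw literal the escape would instead be handed to the regex engine, which also understands it. A sketch of the equivalence:

    import re

    # Normal literal: Python resolves \u0110, so the pattern text is
    # the single character 'Đ'.
    literal_pat = re.compile(u'\u0110')

    # Raw literal: the six characters \u0110 survive and re resolves
    # the escape itself (supported on Python 3).
    raw_pat = re.compile(r'\u0110')

    # Either way Đ becomes Ð.
    assert literal_pat.sub('\u00D0', 'Đavo') == raw_pat.sub('\u00D0', 'Đavo') == 'Ðavo'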
@@ -27,7 +27,7 @@ class CveceZla(BasicNewsRecipe):
 'comment': description, 'tags': 'igre, muzika, film, blog, Srbija', 'publisher': 'Mehmet Krljic', 'language': language
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 remove_tags_before = dict(attrs={'class': 'navigation'})
 remove_tags_after = dict(attrs={'class': 'commentlist'})

@@ -61,7 +61,7 @@ class MediaDaumRecipe(BasicNewsRecipe):
 lambda match: '<em>'),
 (re.compile(r'<i>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
 lambda match: '<i>'),
-(re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501
+(re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*</div>', re.DOTALL | re.IGNORECASE), # noqa: E501, RUF039
 lambda match: '</div>'),
 ]

@@ -42,7 +42,7 @@ class DnevnikCro(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

@@ -20,8 +20,8 @@ class DziennikWschodni(BasicNewsRecipe):
 no_stylesheets = True
 ignore_duplicate_articles = {'title', 'url'}

-preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501
-(re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501
+preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa: E501, RUF039
+(re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa: E501, RUF039

 keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
 remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',

@@ -45,9 +45,7 @@ class Esensja(BasicNewsRecipe):
 '''

 preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
-(re.compile(
-u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
-]
+(re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]

 def parse_index(self):
 soup = self.index_to_soup('http://www.esensja.pl/magazyn/')

@@ -23,9 +23,7 @@ class EsensjaRSS(BasicNewsRecipe):
 remove_javascript = True
 ignore_duplicate_articles = {'title', 'url'}
 preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
-(re.compile(
-u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
-]
+(re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: '')]
 remove_attributes = ['style', 'bgcolor', 'alt', 'color']
 keep_only_tags = [dict(attrs={'class': 'sekcja'}), ]
 remove_tags_after = dict(id='tekst')

@@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe):
 'ul.sep-line > li + li::before {content: " | "} '
 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
 preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
-(re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''),
+(re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), # noqa: RUF039
 (re.compile(type(u'')(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />')
 ]
 remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',

@@ -34,7 +34,7 @@ class gw_krakow(BasicNewsRecipe):

 # rules for gazeta.pl
 preprocess_regexps = [
-(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
 keep_only_tags = [dict(id='gazeta_article')]
 remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
 attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

@@ -33,7 +33,7 @@ class gw_wawa(BasicNewsRecipe):

 # rules for gazeta.pl
 preprocess_regexps = [
-(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')] # noqa: RUF039
 keep_only_tags = [dict(id='gazeta_article')]
 remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(
 attrs={'class': ['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]

@@ -54,11 +54,11 @@ def solve_captcha(captcha):

 # Parse into parts
 pattern = re.compile(
-u'(?P<first_component>[0-9]+)?'
-u'\\s*(?P<operator>[+×−])\\s*'
-u'(?P<second_component>[0-9]+)'
-u'\\s*(=)\\s*'
-u'(?P<result>[0-9]+)?', re.UNICODE)
+r'(?P<first_component>[0-9]+)?'
+u'\\s*(?P<operator>[+×−])\\s*' # noqa: RUF039
+r'(?P<second_component>[0-9]+)'
+r'\s*(=)\s*'
+r'(?P<result>[0-9]+)?', re.UNICODE)

 calculationParts = re.search(pattern, numeric_problem)
 if calculationParts is None:
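The captcha pattern above shows how manual the conversion is: four of the five fragments become raw strings, while the operator fragment, which mixes escapes with the non-ASCII × and −, is left alone and suppressed. For the converted fragments the doubled-backslash and raw spellings are the same string, so matching cannot change:

    # '\\s' in a normal literal and r'\s' denote the same two characters.
    assert '\\s*(=)\\s*' == r'\s*(=)\s*'
    assert u'(?P<result>[0-9]+)?' == r'(?P<result>[0-9]+)?'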
@@ -16,7 +16,7 @@ class in4(BasicNewsRecipe):
 no_stylesheets = True
 remove_empty_feeds = True
 preprocess_regexps = [
-(re.compile(u'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
+(re.compile(r'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
 keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})]
 remove_tags_after = dict(name='img', attrs={'title': 'komentarze'})
 remove_tags = [dict(name='img', attrs={'title': 'komentarze'})]

@@ -29,9 +29,9 @@ class KopalniaWiedzy(BasicNewsRecipe):
 extra_css = '.topimage {margin-top: 30px}'

 preprocess_regexps = [
-(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
+(re.compile(r'<a .* rel="lightboxText" .*><img (.*)></a>'),
 lambda match: '<img class="topimage" ' + match.group(1) + '>'),
-(re.compile(u'<br /><br />'),
+(re.compile(r'<br /><br />'),
 lambda match: '<br/>')
 ]

@@ -66,7 +66,7 @@ class LeMondeAbonne(BasicNewsRecipe):
 dict(name='div', attrs={'class': 'po-copy'})
 ]

-article_id_pattern = re.compile('[0-9]+\\.html')
+article_id_pattern = re.compile(r'[0-9]+\.html')
 article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'

 def get_browser(self):

@@ -43,7 +43,7 @@ class NacionalCro(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 remove_tags = [dict(name=['object', 'link', 'embed'])]

@@ -11,8 +11,8 @@ class NaTemat(BasicNewsRecipe):
 description = u'informacje, komentarze, opinie'
 category = 'news'
 language = 'pl'
-preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501
-(re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501
+preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa: E501, RUF039
+(re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa: E501, RUF039
 cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
 no_stylesheets = True
 keep_only_tags = [dict(id='main')]

@@ -34,7 +34,7 @@ class NjuzNet(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 keep_only_tags = [
 dict(attrs={'id': 'entryMeta'}), dict(attrs={'class': 'post'})

@@ -36,7 +36,7 @@ class NoviList_hr(BasicNewsRecipe):
 p{display: block}
 '''

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 conversion_options = {
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True

@@ -35,7 +35,7 @@ class Novosti(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 keep_only_tags = [dict(attrs={'class': [
 'articleTitle', 'articleInfo', 'articleLead', 'singlePhoto fl', 'articleBody']})]

@@ -45,7 +45,7 @@ class Nspm(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
 remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])]
 remove_attributes = ['width', 'height', 'lang', 'xmlns:fb',
 'xmlns:og', 'vspace', 'hspace', 'type', 'start', 'size']

@@ -31,8 +31,7 @@ class ObservatorulCultural(BasicNewsRecipe):
 def parse_index(self):
 soup = self.index_to_soup(
 'http://www.observatorcultural.ro/Arhiva*-archive.html')
-issueTag = soup.find('a', href=re.compile(
-'observatorcultural.ro\\/Numarul'))
+issueTag = soup.find('a', href=re.compile(r'observatorcultural.ro/Numarul'))
 issueURL = issueTag['href']
 print(issueURL)
 issueSoup = self.index_to_soup(issueURL)

@@ -34,7 +34,7 @@ class Pescanik(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
 remove_tags = [
 dict(name=['object', 'link', 'meta', 'script', 'iframe', 'embed'])]
 keep_only_tags = [

@@ -31,9 +31,9 @@ class AdvancedUserRecipe1289939440(BasicNewsRecipe):
 ]

 preprocess_regexps = [
-(re.compile(u'<p class="perex[^"]*">[^<]*<img[^>]*>',
+(re.compile(r'<p class="perex[^"]*">[^<]*<img[^>]*>',
 re.DOTALL), lambda match: '<p class="intro">'),
-(re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>',
+(re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>', # noqa: RUF039
 re.DOTALL), lambda match: '<!--deleted-->')
 ]

@@ -38,7 +38,7 @@ class RTS(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 feeds = [

@@ -25,4 +25,4 @@ class swiatczytnikow(BasicNewsRecipe):
 dict(name='div', attrs={'class': 'feedflare'})]

 preprocess_regexps = [
-(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')]
+(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')] # noqa: RUF039

@@ -15,8 +15,8 @@ class Tablety_pl(BasicNewsRecipe):
 no_stylesheets = True
 oldest_article = 8
 max_articles_per_feed = 100
-preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''),
-(re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
+preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), # noqa: RUF039
+(re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] # noqa: RUF039
 keep_only_tags = [dict(attrs={'class': ['featured-image', 'article-content clearfix']})]
 remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments',
 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]

@@ -27,7 +27,7 @@ class TheCultOfGhoul(BasicNewsRecipe):
 'comment': description, 'tags': 'film, blog, srbija, strava, uzas', 'publisher': 'Dejan Ognjanovic', 'language': language
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')]

@@ -43,7 +43,7 @@ class VecernjiList(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039

 remove_tags = [
 dict(name=['object', 'link', 'embed']), dict(

@@ -41,7 +41,7 @@ class Vreme(BasicNewsRecipe):
 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
 }

-preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] # noqa: RUF039
 remove_tags_before = dict(attrs={'class': 'toc-heading'})
 remove_tags_after = dict(attrs={'class': 'footer'})

@@ -10,8 +10,8 @@ class WNP(BasicNewsRecipe):
 description = u'Wirtualny Nowy Przemysł'
 category = 'economy'
 language = 'pl'
-preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''),
-(re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')]
+preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), # noqa: RUF039
+(re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] # noqa: RUF039
 oldest_article = 8
 max_articles_per_feed = 100
 no_stylesheets = True

@@ -48,7 +48,7 @@ class ZeitEPUBAbo(BasicNewsRecipe):

 preprocess_regexps = [
 # filtering for correct dashes ("Gedankenstrich" and "bis")
-(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
+(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), # noqa: RUF039
 (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
 (re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
 # fix the number dash number dash for the title image that was broken

@@ -130,9 +130,9 @@ class ZeitEPUBAbo(BasicNewsRecipe):
 (re.compile(
 r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''),
 # before closing quotation
-(re.compile(u' \u00AB'), lambda match: u'\u00AB '),
+(re.compile(u' \u00AB'), lambda match: u'\u00AB '), # noqa: RUF039
 # after opening quotation
-(re.compile(u'\u00BB '), lambda match: u' \u00BB'),
+(re.compile(u'\u00BB '), lambda match: u' \u00BB'), # noqa: RUF039
 # filtering for spaces in large numbers for better readability
 # end of the number with some character following
 (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'),

@@ -151,25 +151,25 @@ class ZeitEPUBAbo(BasicNewsRecipe):
 # filtering for unicode characters that are missing on the Kindle,
 # try to replace them with meaningful work-arounds
 # subscript-0
-(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'),
+(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # noqa: RUF039
 # subscript-1
-(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'),
+(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # noqa: RUF039
 # subscript-2
-(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'),
+(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # noqa: RUF039
 # subscript-3
-(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'),
+(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # noqa: RUF039
 # subscript-4
-(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'),
+(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # noqa: RUF039
 # subscript-5
-(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'),
+(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # noqa: RUF039
 # subscript-6
-(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'),
+(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # noqa: RUF039
 # subscript-7
-(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'),
+(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # noqa: RUF039
 # subscript-8
-(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'),
+(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # noqa: RUF039
 # subscript-9
-(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'),
+(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # noqa: RUF039
 # always chance CO2
 (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
 # remove *** paragraphs

@@ -21,7 +21,7 @@ class PagebreakPageGenerator(IPageGenerator):
 ''' Determine pages based on the presence of <*pagebreak*/>. '''
 html = mobi_html(mobi_file_path)
 pages = []
-for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html):
+for m in re.finditer(br'<[^>]*pagebreak[^>]*>', html):
 pages.append(m.end())

 return Pages(pages)
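From here the same treatment is applied to bytes patterns: b'...' becomes br'...'. For literals with no escape sequences the raw prefix changes nothing at all; a quick check of the pagebreak pattern above:

    import re

    # Identical bytes either way; br'...' just guards future escapes.
    assert b'<[^>]*pagebreak[^>]*>' == br'<[^>]*pagebreak[^>]*>'

    html = b'one<mbp:pagebreak/>two'
    assert [m.end() for m in re.finditer(br'<[^>]*pagebreak[^>]*>', html)] == [19]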
@@ -32,7 +32,7 @@ class TCRCompressor:
 The intent is to create more unused codes.
 '''
 possible_codes = []
-a_code = set(re.findall(b'(?ms).', self.coded_txt))
+a_code = set(re.findall(br'(?ms).', self.coded_txt))

 for code in a_code:
 single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))

@@ -57,7 +57,7 @@ class TCRCompressor:
 '''
 Create new codes from codes that occur in pairs often.
 '''
-possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
+possible_new_codes = list(set(re.findall(br'(?ms)..', self.coded_txt)))
 new_codes_count = []

 for c in possible_new_codes:

@@ -74,7 +74,7 @@ class TCRCompressor:
 def compress(self, txt):
 self._reset()

-self.codes = list(set(re.findall(b'(?ms).', txt)))
+self.codes = list(set(re.findall(br'(?ms).', txt)))

 # Replace the text with their corresponding code
 # FIXME: python3 is native bytearray, but all we want are bytes

@@ -46,7 +46,7 @@ class EPUBInput(InputFormatPlugin):
 from lxml import etree
 idpf_key = opf.raw_unique_identifier
 if idpf_key:
-idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
 idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
 key = None
 for item in opf.identifier_iter():
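Converting '[\u0020\u0009\u000d\u000a]' to a raw string does change the literal (the escapes now reach re unresolved), but not the behaviour, because the regex engine interprets \uXXXX and \xXX escapes itself. A check under that assumption:

    import re

    key = 'urn:uuid:\t 1234\r\n'
    # Both forms strip space, tab, CR and LF alike.
    assert re.sub(r'[\u0020\u0009\u000d\u000a]', '', key) == 'urn:uuid:1234'
    assert re.sub('[\u0020\u0009\u000d\u000a]', '', key) == 'urn:uuid:1234'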
@@ -503,7 +503,7 @@ class EPUBOutput(OutputFormatPlugin):
 tag.tag = XHTML('div')

 # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
-special_chars = re.compile('[\u200b\u00ad]')
+special_chars = re.compile(r'[\u200b\u00ad]')
 for elem in root.iterdescendants('*'):
 if elem.text:
 elem.text = special_chars.sub('', elem.text)

@@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
 res = as_bytes(transform.tostring(result))
 # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
 # clean multiple \n
-res = re.sub(b'\n+', b'\n', res)
+res = re.sub(br'\n+', b'\n', res)
 # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
 # res = re.sub('\s*<body>', '<body>', res)
 # res = re.sub('(?<=\n)\n{2}',

@@ -94,7 +94,7 @@ class DocAnalysis:
 elif format == 'spanned_html':
 linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL)
 elif format == 'txt':
-linere = re.compile('.*?\n')
+linere = re.compile(r'.*?\n')
 self.lines = linere.findall(raw)

 def line_length(self, percent):

@@ -57,8 +57,8 @@ class HeuristicProcessor:
 ' chapters. - ' + str(chap))
 return '<h2>'+chap+'</h2>\n'
 else:
-delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
-delete_quotes = re.compile('\'"')
+delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
+delete_quotes = re.compile(r'\'"')
 txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
 txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
 self.html_preprocess_sections = self.html_preprocess_sections + 1

@@ -109,7 +109,7 @@ class HeuristicProcessor:
 be marked up to return true.
 '''
 htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL)
-line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+line_end_ere = re.compile(r'(\n|\r|\r\n)', re.DOTALL)
 htm_end = htm_end_ere.findall(raw)
 line_end = line_end_ere.findall(raw)
 tot_htm_ends = len(htm_end)

@@ -417,7 +417,7 @@ class HeuristicProcessor:
 # Add markup naively
 # TODO - find out if there are cases where there are more than one <pre> tag or
 # other types of unmarked html and handle them in some better fashion
-add_markup = re.compile('(?<!>)(\n)')
+add_markup = re.compile(r'(?<!>)(\n)')
 html = add_markup.sub('</p>\n<p>', html)
 return html

@@ -440,7 +440,7 @@ class HeuristicProcessor:
 # Get rid of empty <o:p> tags to simplify other processing
 html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html)
 # Delete microsoft 'smart' tags
-html = re.sub('(?i)</?st1:\\w+>', '', html)
+html = re.sub(r'(?i)</?st1:\w+>', '', html)
 # Re-open self closing paragraph tags
 html = re.sub(r'<p[^>/]*/>', '<p> </p>', html)
 # Get rid of empty span, bold, font, em, & italics tags

@@ -451,7 +451,7 @@ class HeuristicProcessor:
 html = re.sub(
 r'\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}'.format(open=open_fmt_pat, close=close_fmt_pat) , ' ', html)
 # delete surrounding divs from empty paragraphs
-html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
+html = re.sub(r'<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
 # Empty heading tags
 html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
 self.deleted_nbsps = True

@@ -538,7 +538,7 @@ class HeuristicProcessor:
 elif content.find('scenebreak') != -1:
 return content
 else:
-content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+content = re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', r'\n\n<h\g<hnum> style="'+top_margin+bottom_margin+'">', content)
 return content

 html = blanks_around_headings.sub(merge_header_whitespace, html)

@@ -551,7 +551,7 @@ class HeuristicProcessor:

 html = blanks_n_nopunct.sub(markup_whitespaces, html)
 if self.html_preprocess_sections > self.min_chapters:
-html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
+html = re.sub(r'(?si)^.*?(?=<h\d)', markup_whitespaces, html)

 return html

@@ -600,13 +600,13 @@ class HeuristicProcessor:
 if re.match(r'^<hr', replacement_break):
 if replacement_break.find('width') != -1:
 try:
-width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
+width = int(re.sub(r'.*?width(:|=)(?P<wnum>\d+).*', r'\g<wnum>', replacement_break))
 except:
 scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
 self.log.warn('Invalid replacement scene break'
 ' expression, using default')
 else:
-replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
+replacement_break = re.sub(r'(?i)(width=\d+\\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
 divpercent = (100 - width) // 2
 hr_open = re.sub(r'45', str(divpercent), hr_open)
 scene_break = hr_open+replacement_break+'</div>'

@@ -617,24 +617,24 @@ class HeuristicProcessor:
 else:
 from calibre.utils.html2text import html2text
 replacement_break = html2text(replacement_break)
-replacement_break = re.sub('\\s', ' ', replacement_break)
+replacement_break = re.sub(r'\s', ' ', replacement_break)
 scene_break = self.scene_break_open+replacement_break+'</p>'
 else:
-replacement_break = re.sub('\\s', ' ', replacement_break)
+replacement_break = re.sub(r'\s', ' ', replacement_break)
 scene_break = self.scene_break_open+replacement_break+'</p>'

 return scene_break

 def check_paragraph(self, content):
-content = re.sub('\\s*</?span[^>]*>\\s*', '', content)
-if re.match('.*["\'.!?:]$', content):
+content = re.sub(r'\s*</?span[^>]*>\s*', '', content)
+if re.match(r'.*["\'.!?:]$', content):
 # print('detected this as a paragraph')
 return True
 else:
 return False

 def abbyy_processor(self, html):
-abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+abbyy_line = re.compile(r'((?P<linestart><p\sstyle="(?P<styles>[^"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
 empty_paragraph = '\n<p> </p>\n'
 self.in_blockquote = False
 self.previous_was_paragraph = False
@@ -680,7 +680,7 @@ class HeuristicProcessor:
 if style == 'text-align' and setting != 'left':
 text_align = style+':'+setting+';'
 if style == 'text-indent':
-setting = int(re.sub('\\s*pt\\s*', '', setting))
+setting = int(re.sub(r'\s*pt\s*', '', setting))
 if 9 < setting < 14:
 text_indent = indented_text
 else:

@@ -853,7 +853,7 @@ class HeuristicProcessor:
 # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
 if getattr(self.extra_opts, 'format_scene_breaks', False):
 self.log.debug('Formatting scene breaks')
-html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
+html = re.sub(r'(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
 html = self.detect_scene_breaks(html)
 html = self.detect_whitespace(html)
 html = self.detect_soft_breaks(html)

@@ -870,9 +870,9 @@ class HeuristicProcessor:
 replacement_break = self.markup_user_break(replacement_break)
 if scene_break_count >= 1:
 html = detected_scene_break.sub(replacement_break, html)
-html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
 else:
-html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+html = re.sub(r'<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)

 if self.deleted_nbsps:
 # put back non-breaking spaces in empty paragraphs so they render correctly

@@ -62,7 +62,7 @@ class TextRun:
 self.first_html_parent = first_html_parent
 if self.ws_pat is None:
 TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
-TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile('(\u00ad)')
+TextRun.soft_hyphen_pat = self.soft_hyphen_pat = re.compile(r'(\u00ad)')
 self.style = style
 self.texts = []
 self.link = None

@@ -42,21 +42,21 @@ def get_metadata(stream, extract_cover=True):
 for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
 m = re.search(br'TITLE="(.*?)"', comment)
 if m:
-mi.title = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
+mi.title = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
 m = re.search(br'AUTHOR="(.*?)"', comment)
 if m:
 if mi.authors == [_('Unknown')]:
 mi.authors = []
-mi.authors.append(re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))))
+mi.authors.append(re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace'))))
 m = re.search(br'PUBLISHER="(.*?)"', comment)
 if m:
-mi.publisher = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
+mi.publisher = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
 m = re.search(br'COPYRIGHT="(.*?)"', comment)
 if m:
-mi.rights = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
+mi.rights = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
 m = re.search(br'ISBN="(.*?)"', comment)
 if m:
-mi.isbn = re.sub('[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))
+mi.isbn = re.sub(r'[\x00-\x1f]', '', prepare_string_for_xml(m.group(1).strip().decode('cp1252', 'replace')))

 return mi

@@ -31,7 +31,7 @@ def get_metadata(stream, extract_cover=True):

 mdata = mdata[:1024]

-mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
+mo = re.search(r'(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
 if mo is not None:
 mi.title = mo.group('title')
 mi.authors = mo.group('author').split(',')

@@ -393,8 +393,7 @@ class MobiReader:
 self.processed_html = self.processed_html.replace('</html>', '')

 def remove_random_bytes(self, html):
-return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
-'', html)
+return re.sub(r'\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', '', html)

 def ensure_unit(self, raw, unit='px'):
 if re.search(r'\d+$', raw) is not None:

@@ -1340,7 +1340,7 @@ class EpubContainer(Container):
 break
 if raw_unique_identifier is not None:
 idpf_key = raw_unique_identifier
-idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+idpf_key = re.sub(r'[\u0020\u0009\u000d\u000a]', '', idpf_key)
 idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
 return package_id, raw_unique_identifier, idpf_key

@@ -124,8 +124,8 @@ class Reader132(FormatReader):

 if self.header_record.footnote_count > 0:
 html += '<br /><h1>%s</h1>' % _('Footnotes')
-footnoteids = re.findall(
-'\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
+footnoteids = re.findall(r'\w+(?=\x00)',
+self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
 for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
 self.log.debug('Extracting footnote page %i' % i)
 if fid < len(footnoteids):

@@ -136,8 +136,8 @@ class Reader132(FormatReader):

 if self.header_record.sidebar_count > 0:
 html += '<br /><h1>%s</h1>' % _('Sidebar')
-sidebarids = re.findall(
-'\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
+sidebarids = re.findall(r'\w+(?=\x00)',
+self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
 for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
 self.log.debug('Extracting sidebar page %i' % i)
 if sid < len(sidebarids):

@@ -32,7 +32,7 @@ class PdbHeaderReader:

 def name(self):
 self.stream.seek(0)
-return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))
+return re.sub(br'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))

 def full_section_info(self, number):
 if not (0 <= number < self.num_sections):

@@ -70,7 +70,7 @@ class PdbHeaderBuilder:
 self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
 if isinstance(title, str):
 title = title.encode('ascii', 'replace')
-self.title = b'%s\x00' % re.sub(b'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31]
+self.title = b'%s\x00' % re.sub(br'[^-A-Za-z0-9 ]+', b'_', title).ljust(31, b'\x00')[:31]

 def build_header(self, section_lengths, out_stream):
 '''

@@ -94,7 +94,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
 raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
 raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
 raw = xml_replace_entities(raw)
-raw = re.sub('[\u00a0\u2029]', ' ', raw)
+raw = re.sub(r'[\u00a0\u2029]', ' ', raw)

 i.write(raw.encode('utf-8'))

@@ -196,7 +196,7 @@ class PMLMLizer:

 # Turn all characters that cannot be represented by themself into their
 # PML code equivalent
-text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)
+text = re.sub(r'[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

 # Remove excess spaces at beginning and end of lines
 text = re.sub(r'(?m)^[ ]+', '', text)

@@ -209,14 +209,14 @@ class PMLMLizer:
 text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)

 # Remove excessive newlines.
-text = re.sub('\n[ ]+\n', '\n\n', text)
+text = re.sub(r'\n[ ]+\n', '\n\n', text)
 if self.opts.remove_paragraph_spacing:
-text = re.sub('\n{2,}', '\n', text)
+text = re.sub(r'\n{2,}', '\n', text)
 # Only indent lines that don't have special formatting
 text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
 if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
 else:
-text = re.sub('\n{3,}', '\n\n', text)
+text = re.sub(r'\n{3,}', '\n\n', text)

 return text

@@ -61,8 +61,8 @@ def to_int(x):


 def clean(text):
-text = re.sub('\\s*\n\\s*', '\n', text)
-text = re.sub('[ \t]{2,}', ' ', text)
+text = re.sub(r'\s*\n\s*', '\n', text)
+text = re.sub(r'[ \t]{2,}', ' ', text)
 return text.strip()


@@ -199,7 +199,7 @@ class RTFMLizer:

 # Remove excessive spaces
 text = re.sub(r'[ ]{2,}', ' ', text)
-text = re.sub('\t{2,}', '\t', text)
+text = re.sub(r'\t{2,}', '\t', text)
 text = text.replace('\t ', '\t')

 # Remove excessive line breaks

@@ -719,7 +719,7 @@ class ProcessTokens:
 def divide_num(self, numerator, denominator):
 try:
 # calibre why ignore negative number? Wrong in case of \fi
-numerator = float(re.search('[0-9.\\-]+', numerator).group())
+numerator = float(re.search(r'[0-9.\-]+', numerator).group())
 except TypeError:
 if self.__run_level > 3:
 msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \

@@ -162,12 +162,12 @@ class SNBMLizer:
 # text = re.sub('[ ]{2,}', ' ', text)

 # Remove excessive newlines.
-text = re.sub('\n[ ]+\n', '\n\n', text)
+text = re.sub(r'\n[ ]+\n', '\n\n', text)
 if self.opts.remove_paragraph_spacing:
-text = re.sub('\n{2,}', '\n', text)
+text = re.sub(r'\n{2,}', '\n', text)
 text = re.sub(r'(?imu)^(?=.)', '\t', text)
 else:
-text = re.sub('\n{3,}', '\n\n', text)
+text = re.sub(r'\n{3,}', '\n\n', text)

 # Replace spaces at the beginning and end of lines
 text = re.sub(r'(?imu)^[ ]+', '', text)

@@ -58,7 +58,7 @@ class MarkdownMLizer(OEB2HTML):
 # Remove tabs that aren't at the beginning of a line
 new_text = []
 for l in text.splitlines():
-start = re.match('\t+', l)
+start = re.match(r'\t+', l)
 if start:
 start = start.group()
 else:

@@ -71,7 +71,7 @@ class MarkdownMLizer(OEB2HTML):
 text = re.sub(r'(?msu)^[ ]+$', '', text)

 # Reduce blank lines
-text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
+text = re.sub(r'(?msu)\n{7,}', '\n' * 6, text)

 # Remove blank lines at beginning and end of document.
 text = re.sub(r'^\s*', '', text)

@@ -31,7 +31,7 @@ def clean_txt(txt):
 txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

 # Replace whitespace at the beginning of the line with
-txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)
+txt = re.sub(r'(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)

 # Condense redundant spaces
 txt = re.sub(r'[ ]{2,}', ' ', txt)

@@ -40,7 +40,7 @@ def clean_txt(txt):
 txt = re.sub(r'^\s+(?=.)', '', txt)
 txt = re.sub(r'(?<=.)\s+$', '', txt)
 # Remove excessive line breaks.
-txt = re.sub('\n{5,}', '\n\n\n\n', txt)
+txt = re.sub(r'\n{5,}', '\n\n\n\n', txt)
 # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
 txt = clean_ascii_chars(txt)

@@ -190,7 +190,7 @@ def separate_paragraphs_single_line(txt):


 def separate_paragraphs_print_formatted(txt):
-txt = re.sub('(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
+txt = re.sub(r'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
 return txt


@@ -109,16 +109,16 @@ class TextileMLizer(OEB2HTML):

 # reduce blank lines
 text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
-text = re.sub('%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text)
+text = re.sub(r'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
 # Check span following blank para
 text = re.sub(r'\n+ +%', r' %', text)
-text = re.sub('p[<>=]{1,2}\\.\n\n?', r'', text)
+text = re.sub(r'p[<>=]{1,2}\.\n\n?', '', text)
 # blank paragraph
 text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
 # blank paragraph
 text = text.replace('\n\xa0', '\np. ')
 # blank paragraph
-text = re.sub('\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
+text = re.sub(r'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
 text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
 text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
 # sort out spaces in tables
@@ -123,19 +123,19 @@ class TXTMLizer:
 text = text.replace('\f+', ' ')

 # Single line paragraph.
-text = re.sub('(?<=.)\n(?=.)', ' ', text)
+text = re.sub(r'(?<=.)\n(?=.)', ' ', text)

 # Remove multiple spaces.
 text = re.sub(r'[ ]{2,}', ' ', text)

 # Remove excessive newlines.
-text = re.sub('\n[ ]+\n', '\n\n', text)
+text = re.sub(r'\n[ ]+\n', '\n\n', text)
 if self.opts.remove_paragraph_spacing:
-text = re.sub('\n{2,}', '\n', text)
+text = re.sub(r'\n{2,}', '\n', text)
 text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: '%s\n\n' % mo.group('t'), text)
 text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '{}\n\n\n\n\n\n{}'.format(mo.group('b'), mo.group('t')), text)
 else:
-text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
+text = re.sub(r'\n{7,}', '\n\n\n\n\n\n', text)

 # Replace spaces at the beginning and end of lines
 # We don't replace tabs because those are only added

@@ -87,4 +87,4 @@ class Jadecoder(Unidecoder):
 text = self.conv.do(text)
 except Exception:
 pass
-return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
+return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

@@ -73,7 +73,7 @@ class Unidecoder:

 def decode(self, text):
 # Replace characters larger than 127 with their ASCII equivalent.
-return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
+return re.sub(r'[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

 def replace_point(self, codepoint):
 '''

@@ -733,7 +733,7 @@ class ResultDetails(QWidget):
 def render_results(self, results, individual_match=None):
 html = []
 space_pat = re.compile(r'\s+')
-markup_pat = re.compile('\x1d')
+markup_pat = re.compile(r'\x1d')

 def markup_text(text):
 count = 0

@@ -433,7 +433,7 @@ def run_gui_(opts, args, app, gui_debug=None):
 winutil.prepare_for_restart()
 with open(debugfile, 'r+b') as f:
 raw = f.read()
-raw = re.sub(b'(?<!\r)\n', b'\r\n', raw)
+raw = re.sub(br'(?<!\r)\n', br'\r\n', raw)
 f.seek(0)
 f.truncate()
 f.write(raw)

@@ -39,7 +39,7 @@ attribute_name_pat = re.compile(r'''[^%s"'/><=]+''' % space_chars)
 self_closing_pat = re.compile(r'/\s*>')
 unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
 cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
-nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
+nbsp_pat = re.compile(r'[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens

 NORMAL = 0
 IN_OPENING_TAG = 1

@@ -119,7 +119,7 @@ class NumberToText: # {{{
 self.text = NumberToText(self.number.replace('%',' percent')).text

 # Test for decimal
-elif re.search('\\.',self.number):
+elif '.' in self.number:
 if self.verbose:
 self.log('Decimal: %s' % self.number)
 self.number_as_float = self.number
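The NumberToText hunk above is the one place the commit goes beyond respelling a literal: a regex search for a lone dot needs no pattern at all, so a substring test replaces re.search entirely. The two are equivalent for a fixed character:

    import re

    for s in ('3.14', '42'):
        assert ('.' in s) == bool(re.search(r'\.', s))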
@@ -150,7 +150,7 @@ class NumberToText: # {{{
 self.text = NumberToText(self.number_as_float).text

 # Test for hybrid e.g., 'K2, 2nd, 10@10'
-elif re.search('[\\D]+', self.number):
+elif re.search(r'[\D]+', self.number):
 if self.verbose:
 self.log('Hybrid: %s' % self.number)
 # Split the token into number/text

@@ -11,7 +11,7 @@ from calibre.utils.html2text import html2text

 # Hackish - ignoring sentences ending or beginning in numbers to avoid
 # confusion with decimal points.
-lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
+lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])')
 lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
 sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
 re.IGNORECASE)

@@ -674,7 +674,7 @@ class CustomColumns:
 editable=True, display={}):
 if not label:
 raise ValueError(_('No label was provided'))
-if re.match('^\\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
+if re.match(r'^\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
 raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter'))
 if datatype not in self.CUSTOM_DATA_TYPES:
 raise ValueError('%r is not a supported data type'%datatype)

@@ -193,7 +193,7 @@ def load_dictionary(dictionary):
 class Dictionaries:

 def __init__(self):
-self.remove_hyphenation = re.compile('[\u2010-]+')
+self.remove_hyphenation = re.compile(r'[\u2010-]+')
 self.negative_pat = re.compile(r'-[.\d+]')
 self.fix_punctuation_pat = re.compile(r'''[:.]''')
 self.dictionaries = {}

@@ -2553,7 +2553,7 @@ class BibTeX:
 self.ascii_bibtex = True
 # This substitution is based on the description of cite key restrictions at
 # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html
-self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]')
+self.invalid_cit = re.compile(r'[ "@\',\#}{~%&$^]')
 self.upper = re.compile('[' +
 string.ascii_uppercase + ']')
 self.escape = re.compile(r'[#&%_]')

@@ -94,4 +94,4 @@ def unescape(text, rm=False, rchar=''):
 if rm:
 return rchar # replace by char
 return text # leave as is
-return re.sub('&#?\\w+;', fixup, text)
+return re.sub(r'&#?\w+;', fixup, text)

@@ -545,7 +545,7 @@ class RecursiveFetcher:
 dsrc = self.fetch_url(iurl)
 newbaseurl = dsrc.newurl
 if len(dsrc) == 0 or \
-len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
+len(re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
 raise ValueError('No content at URL %r'%iurl)
 if callable(self.encoding):
 dsrc = self.encoding(dsrc)

@@ -60,7 +60,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels):
 displayLevels = 0
 listStyle = ListStyle(name=styleName)
 numFormatPattern = re.compile(r'([1IiAa])')
-cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?')
+cssLengthPattern = re.compile(r'([^a-z]+)\s*([a-z]+)?')
 m = cssLengthPattern.search(spacing)
 if (m is not None):
 cssLengthNum = float(m.group(1))