Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-08-07 09:01:38 -04:00
always use raw-string for regex (auto-fix)
ruff 'RUF039'
This commit is contained in:
commit ac6912565a
parent 567a0187f3
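Context for the change, with a minimal sketch that is not part of the commit: RUF039 rewrites regex patterns given as plain string literals into raw strings. In a plain literal an escape such as \d must be doubled, and Python 3.12+ warns about un-doubled invalid escape sequences; a raw string reads exactly as the regex engine sees it. For patterns with no backslashes the two spellings compile identically, which is why the auto-fix is behavior-preserving there.

    import re

    # Plain literal: the backslash must be doubled, otherwise '\d' is an
    # invalid escape sequence that Python 3.12+ flags with a SyntaxWarning.
    digits_plain = re.compile('\\d+')

    # Raw string, as RUF039 prefers: one backslash, no escaping ambiguity.
    digits_raw = re.compile(r'\d+')

    assert digits_plain.match('2025') and digits_raw.match('2025')

    # With no backslashes the two spellings produce identical patterns,
    # so adding the r prefix, as this commit does throughout, is a no-op.
    assert re.compile('story').pattern == re.compile(r'story').pattern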
@@ -49,7 +49,7 @@ def merge():
 clone_node(child, symbol)
 ans.append(symbol)
 ans = etree.tostring(ans, encoding='unicode', pretty_print=True, with_tail=False)
-ans = re.sub('<svg[^>]+>', '<svg style="display:none">', ans, count=1)
+ans = re.sub(r'<svg[^>]+>', '<svg style="display:none">', ans, count=1)
 return ans
@@ -29,6 +29,6 @@ class AlejaKomiksu(BasicNewsRecipe):
 def skip_ad_pages(self, soup):
 tag = soup.find(attrs={'class': 'rodzaj'})
 if tag and tag.a.string.lower().strip() == 'recenzje':
-link = soup.find(text=re.compile('recenzuje'))
+link = soup.find(text=re.compile(r'recenzuje'))
 if link:
 return self.index_to_soup(link.parent['href'], raw=True)
@@ -63,12 +63,12 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
 dict(
 attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}),
 dict(name='img', attrs={'alt': 'logo'}),
-dict(name='div', attrs={'class': re.compile('infoEl')}),
-dict(name='span', attrs={'class': re.compile('loupe')})
+dict(name='div', attrs={'class': re.compile(r'infoEl')}),
+dict(name='span', attrs={'class': re.compile(r'loupe')})
 ]

 remove_tags_after = [
-dict(name='div', attrs={'itemprop': re.compile('articleBody')})
+dict(name='div', attrs={'itemprop': re.compile(r'articleBody')})
 ]

 def preprocess_html(self, soup):
@@ -58,7 +58,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
 def get_cover_url(self):
 soup = self.index_to_soup('http://www.birminghammail.co.uk')
 cov = soup.find(attrs={'src': re.compile(
-'http://images.icnetwork.co.uk/upl/birm')})
+r'http://images.icnetwork.co.uk/upl/birm')})
 cov = str(cov)
 cov2 = re.findall(
 r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
@@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -39,12 +39,12 @@ class CSMonitor(BasicNewsRecipe):
 }

 remove_tags = [
-dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile('(^|| )podStoryRel($|| )', re.DOTALL)}), dict(
+dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL)}), dict(
 attrs={'class': ['bottom-rel', 'hide']}), dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']})
 ]
 keep_only_tags = [
 dict(name='h1', attrs={'class': 'head'}), dict(name='h2', attrs={'class': 'subhead'}), dict(attrs={'class': [
-'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
+'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
 ]
 remove_attributes = ['xmlns:fb']
@@ -74,11 +74,11 @@ class CSMonitor(BasicNewsRecipe):
 nurl = 'http://www.csmonitor.com' + nexttag['href']
 soup2 = self.index_to_soup(nurl)
 texttag = soup2.find(
-attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
+attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
 if texttag:
 appendtag = soup.find(
-attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
-for citem in texttag.findAll(attrs={'class': [re.compile('(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}):
+attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
+for citem in texttag.findAll(attrs={'class': [re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}):
 citem.extract()
 self.append_page(soup2)
 texttag.extract()
@@ -47,7 +47,7 @@ class Chronicle(BasicNewsRecipe):

 # Find cover
 cover = soup0.find('div', attrs={
-'class': 'side-content'}).find(attrs={'src': re.compile('photos/biz/Current')})
+'class': 'side-content'}).find(attrs={'src': re.compile(r'photos/biz/Current')})
 if cover is not None:
 if 'chronicle.com' in cover['src']:
 self.cover_url = cover['src']
@@ -86,7 +86,7 @@ class CourrierInternational(BasicNewsRecipe):
 return br

 def preprocess_html(self, soup):
-for link in soup.findAll('a', href=re.compile('^/')):
+for link in soup.findAll('a', href=re.compile(r'^/')):
 link['href'] = 'http://www.courrierinternational.com' + link['href']
 return soup
@@ -71,10 +71,10 @@ class AdvancedUserRecipe1467571059(BasicNewsRecipe):
 remove_tags = [
 dict(name=['embed', 'object']),
 dict(name='div', attrs={'class':['note NotePortrait', 'note']}),
-dict(name='ul', attrs={'class':re.compile('article__share')}),
+dict(name='ul', attrs={'class':re.compile(r'article__share')}),
 dict(name='div', attrs={'class':'slideshow__controls'}),
 dict(name='a', attrs={'role':'button'}),
-dict(name='figure', attrs={'class':re.compile('video')})
+dict(name='figure', attrs={'class':re.compile(r'video')})
 ]

 remove_attributes = ['width', 'height']
@@ -31,9 +31,9 @@ class deredactie(BasicNewsRecipe):
 catnames = {}
 soup = self.index_to_soup(
 'http://www.deredactie.be/cm/vrtnieuws.deutsch')
-for elem in soup.findAll('li', attrs={'id': re.compile('^navItem[2-9]')}):
+for elem in soup.findAll('li', attrs={'id': re.compile(r'^navItem[2-9]')}):
 a = elem.find('a', href=True)
-m = re.search('(?<=/)[^/]*$', a['href'])
+m = re.search(r'(?<=/)[^/]*$', a['href'])
 cat = str(m.group(0))
 categories.append(cat)
 catnames[cat] = a['title']
@@ -45,7 +45,7 @@ class deredactie(BasicNewsRecipe):
 articles = []
 soup = self.index_to_soup(
 'http://www.deredactie.be/cm/vrtnieuws.deutsch/' + cat)
-for a in soup.findAll('a', attrs={'href': re.compile('deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}):
+for a in soup.findAll('a', attrs={'href': re.compile(r'deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}):
 skip_this_article = False
 url = a['href'].strip()
 if url.startswith('/'):
@@ -51,7 +51,7 @@ class Donga(BasicNewsRecipe):
 # https://www.donga.com/news/[sections]/article/all/[date]/[gid]/1
 # Return print version url with syntax:
 # https://www.donga.com/news/View?gid=[gid]&date=[date]
-reobject = re.search('(?<=/all/)([0-9]*)/([0-9]*)', url)
+reobject = re.search(r'(?<=/all/)([0-9]*)/([0-9]*)', url)
 date = reobject.group(1)
 gid = reobject.group(2)
@@ -33,7 +33,7 @@ class dwutygodnik(BasicNewsRecipe):
 browser.open('http://www.dwutygodnik.com/')

 # find the link
-epublink = browser.find_link(text_regex=re.compile('Wydanie EPUB'))
+epublink = browser.find_link(text_regex=re.compile(r'Wydanie EPUB'))

 # download ebook
 self.report_progress(0, _('Downloading ePUB'))
@@ -21,8 +21,8 @@ class Dziennik_pl(BasicNewsRecipe):
 remove_empty_feeds = True
 ignore_duplicate_articles = {'title', 'url'}
 extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}'
-preprocess_regexps = [(re.compile('Komentarze:'), lambda m: ''), (re.compile(
-'<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
+preprocess_regexps = [(re.compile(r'Komentarze:'), lambda m: ''), (re.compile(
+r'<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
 keep_only_tags = [dict(id='article')]
 remove_tags = [dict(name='div', attrs={'class': ['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class': ['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')] # noqa: E501
 feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
@@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -51,7 +51,7 @@ class Esensja(BasicNewsRecipe):

 def parse_index(self):
 soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
-a = soup.find('a', attrs={'href': re.compile('.*/index.html')})
+a = soup.find('a', attrs={'href': re.compile(r'.*/index.html')})
 year = a['href'].split('/')[0]
 month = a['href'].split('/')[1]
 self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
@@ -149,7 +149,7 @@ class Esensja(BasicNewsRecipe):
 info = tag.find(attrs={'class': 'img_info'})
 text = str(tag)
 if not src:
-src = re.search('src="[^"]*?"', text)
+src = re.search(r'src="[^"]*?"', text)
 if src:
 src = src.group(0)
 src = src[5:].replace('//', '/')
@@ -95,7 +95,7 @@ class EsensjaRSS(BasicNewsRecipe):
 info = tag.find(attrs={'class': 'img_info'})
 text = str(tag)
 if not src:
-src = re.search('src="[^"]*?"', text)
+src = re.search(r'src="[^"]*?"', text)
 if src:
 src = src.group(0)
 src = src[5:].replace('//', '/')
@@ -109,7 +109,7 @@ img { background: none !important; float: none; margin: 0px; }

 for post in soup.findAll('a'):
 strpost = str(post)
-if re.match('<a href="https://www1.folha.uol.com.br/.*/"><svg aria-hidden="true" class="icon icon--star"', strpost):
+if re.match(r'<a href="https://www1.folha.uol.com.br/.*/"><svg aria-hidden="true" class="icon icon--star"', strpost):
 if articles:
 feeds.append((section_title, articles))
 self.log()
@@ -39,7 +39,7 @@ class AdvancedUserRecipe1515196393(BasicNewsRecipe):
 feeds = []
 br = self.get_browser()
 self.ctdir = PersistentTemporaryDirectory()
-for x in toc.findAll(['li'], attrs={'class': re.compile('.*get_content.*')}):
+for x in toc.findAll(['li'], attrs={'class': re.compile(r'.*get_content.*')}):
 edwo = x.find('a')
 title = self.tag_to_string(edwo)
 self.log('\t\tFound article:', title)
@@ -54,7 +54,7 @@ class GN(BasicNewsRecipe):
 }]
 feeds.append((u'Na dobry początek', articles))
 # columns:
-for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
+for addr in soup.findAll('a', attrs={'href': re.compile(r'kategoria')}):
 if not addr.span:
 main_block = self.index_to_soup(
 'http://www.gosc.pl' + addr['href'])
@@ -50,7 +50,7 @@ class GN(BasicNewsRecipe):
 }]
 feeds.append((u'Na dobry początek', articles))
 # columns:
-for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
+for addr in soup.findAll('a', attrs={'href': re.compile(r'kategoria')}):
 if not addr.span:
 main_block = self.index_to_soup(
 'http://www.gosc.pl' + addr['href'])
@@ -50,10 +50,10 @@ class GazetvanAntwerpen(BasicNewsRecipe):
 remove_tags = [
 dict(name=['embed', 'object']),
 dict(name='div', attrs={'class': ['note NotePortrait', 'note']}),
-dict(name='ul', attrs={'class': re.compile('article__share')}),
+dict(name='ul', attrs={'class': re.compile(r'article__share')}),
 dict(name='div', attrs={'class': 'slideshow__controls'}),
 dict(name='a', attrs={'role': 'button'}),
-dict(name='figure', attrs={'class': re.compile('video')})
+dict(name='figure', attrs={'class': re.compile(r'video')})
 ]

 remove_attributes = ['width', 'height']
@@ -78,7 +78,7 @@ class HNWithCommentsLink(BasicNewsRecipe):
 br = td.find('br')
 if br:
 br.extract()
-reply = td.find('a', attrs={'href': re.compile('^reply?')})
+reply = td.find('a', attrs={'href': re.compile(r'^reply?')})
 if reply:
 reply.parent.extract()
 td.name = 'div'
@@ -59,7 +59,7 @@ class Handelsblatt(BasicNewsRecipe):
 dict(name='aside', attrs={'class': ['vhb-article-element vhb-left',
 'vhb-article-element vhb-left vhb-teasergallery',
 'vhb-article-element vhb-left vhb-shorttexts']}),
-dict(name='aside', attrs={'class': re.compile('vhb-club-events')}),
+dict(name='aside', attrs={'class': re.compile(r'vhb-club-events')}),
 dict(name='article', attrs={'class': ['vhb-imagegallery vhb-teaser',
 'vhb-teaser vhb-type-video']}),
 dict(name='small', attrs={'class': ['vhb-credit']}),
@@ -70,14 +70,14 @@ class Handelsblatt(BasicNewsRecipe):
 'opinary-widget-wrapper',
 'vhb-article__content-element--shorttextgallery',
 'vhb-hollow-area vhb-hollow-area--col-1']}),
-dict(name='div', attrs={'class': re.compile('stepstone')}),
-dict(name='div', attrs={'class': re.compile('vhb-imagegallery')}),
+dict(name='div', attrs={'class': re.compile(r'stepstone')}),
+dict(name='div', attrs={'class': re.compile(r'vhb-imagegallery')}),
 dict(name='div', attrs={'id': ['highcharts_infografik']}),
-dict(name='div', attrs={'id': re.compile('dax-sentiment')}),
-dict(name=['div', 'section'], attrs={'class': re.compile('slider')}),
+dict(name='div', attrs={'id': re.compile(r'dax-sentiment')}),
+dict(name=['div', 'section'], attrs={'class': re.compile(r'slider')}),
 dict(name='a', attrs={'class': ['twitter-follow-button']}),
 dict(name='img', attrs={'class': ['highlight-icon', 'lb-author__avatar', 'pin-icon']}),
-dict(name='img', attrs={'alt': re.compile('Handelsblatt Morning Briefing')}),
+dict(name='img', attrs={'alt': re.compile(r'Handelsblatt Morning Briefing')}),
 dict(name=['blockquote', 'button', 'link'])
 ]
@@ -138,7 +138,7 @@ class Handelsblatt(BasicNewsRecipe):

 def postprocess_html(self, soup, first_fetch):
 # convert lists of author(s) and date(s) into simple text
-for cap in soup.find_all('div', {'class': re.compile('vhb-article-caption')}):
+for cap in soup.find_all('div', {'class': re.compile(r'vhb-article-caption')}):
 cap.replace_with(cap.encode_contents().decode('utf-8').strip() + ' ')
 for row in soup.find_all('div', {'class': 'vhb-article-author-row'}):
 for ul in row.find_all('ul'):
@@ -160,7 +160,7 @@ class Handelsblatt(BasicNewsRecipe):
 fig.find('div', {'class': 'vhb-caption'}).replace_with(cap)
 # remove references to related articles
 for strong in soup.find_all('strong'):
-if strong.string and (re.match('^Mehr:? ?', strong.string) or re.match('^>>.*', strong.string)):
+if strong.string and (re.match(r'^Mehr:? ?', strong.string) or re.match(r'^>>.*', strong.string)):
 p_parent = strong.find_parent('p')
 if p_parent:
 p_parent.decompose()
@@ -49,7 +49,7 @@ class HistoryToday(BasicNewsRecipe):
 # Go to issue
 soup = self.index_to_soup('https://www.historytoday.com/contents')
 cover = soup.find('div', attrs={
-'id': 'content-area'}).find('img', attrs={'src': re.compile('.*cover.*')})['src']
+'id': 'content-area'}).find('img', attrs={'src': re.compile(r'.*cover.*')})['src']
 self.cover_url = cover
 self.log(self.cover_url)
@@ -89,7 +89,7 @@ class IndiaToday(BasicNewsRecipe):
 return soup

 def preprocess_raw_html(self, raw, *a):
-m = re.search('id="__NEXT_DATA__" type="application/json">', raw)
+m = re.search(r'id="__NEXT_DATA__" type="application/json">', raw)
 raw = raw[m.start():]
 raw = raw.split('>', 1)[1]
 data = json.JSONDecoder().raw_decode(raw)[0]
@@ -36,7 +36,7 @@ class JoopRecipe(BasicNewsRecipe):
 keep_only_tags.append(
 dict(name='h2', attrs={'class': 'columnhead smallline'}))
 keep_only_tags.append(
-dict(name='div', attrs={'class': re.compile('article.*')}))
+dict(name='div', attrs={'class': re.compile(r'article.*')}))

 extra_css = '''
 body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
@@ -44,16 +44,16 @@ class Kurier(BasicNewsRecipe):
 ]

 keep_only_tags = [
-dict(name='article', attrs={'class': re.compile('main-article')})
+dict(name='article', attrs={'class': re.compile(r'main-article')})
 ]

 remove_tags = [
 dict(name='div', attrs={'class': 'social-media-container'}),
 dict(name='section', attrs={'class': 'tags'}),
-dict(name='section', attrs={'class': re.compile('comment-box')}),
-dict(name='section', attrs={'class': re.compile('related-content')}),
-dict(name='section', attrs={'class': re.compile('article-slider')}),
-dict(name='section', attrs={'class': re.compile('commentcontainer')}),
+dict(name='section', attrs={'class': re.compile(r'comment-box')}),
+dict(name='section', attrs={'class': re.compile(r'related-content')}),
+dict(name='section', attrs={'class': re.compile(r'article-slider')}),
+dict(name='section', attrs={'class': re.compile(r'commentcontainer')}),
 dict(name='blockquote')
 ]
@@ -21,7 +21,7 @@ class Kyungyhang(BasicNewsRecipe):
 remove_javascript = True

 preprocess_regexps = [
-(re.compile("<div class='ad_movFocus'.*</html>",
+(re.compile(r"<div class='ad_movFocus'.*</html>",
 re.DOTALL | re.IGNORECASE), lambda match: '</html>'),
 ]
@@ -121,7 +121,7 @@ class LeMondeAbonne(BasicNewsRecipe):
 files = os.listdir(path)

 nb_index_files = len([
-name for name in files if re.match('frame_gauche_[0-9]+.html', name)
+name for name in files if re.match(r'frame_gauche_[0-9]+.html', name)
 ])

 flux = []
@@ -144,7 +144,7 @@ class WeeklyLWN(BasicNewsRecipe):
 # Most articles have anchors in their titles, *except* the
 # security vulnerabilities
 article_anchor = curr.find(
-name='a', attrs={'href': re.compile('^/Articles/')})
+name='a', attrs={'href': re.compile(r'^/Articles/')})

 if article_anchor:
 article_url = article_anchor.get('href')
@@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -71,21 +71,21 @@ class Newsweek(BasicNewsRecipe):
 strong = p.find('strong')
 if strong:
 newest = re.compile(
-'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek')
+r'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek')
 if newest.search(str(strong)):
 strong.extract()
 continue

 itunes = p.find('a')
 if itunes:
-reurl = re.compile('itunes.apple.com')
+reurl = re.compile(r'itunes.apple.com')
 if reurl.search(str(itunes['href'])):
 p.extract()
 continue

 imagedesc = p.find('div', attrs={'class': 'image-desc'})
 if imagedesc:
-redesc = re.compile('Okładka numeru')
+redesc = re.compile(r'Okładka numeru')
 if (redesc.search(str(imagedesc))):
 p.extract()
 continue
@@ -77,10 +77,10 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe):
 print('-------------------------get index of paper--------------------------------')
 result = []
 soup = self.index_to_soup('http://www.nikkei.com/paper/')
-sections = soup.findAll(attrs={'class': re.compile('.*cmn-article_title.*')})
+sections = soup.findAll(attrs={'class': re.compile(r'.*cmn-article_title.*')})

 for sect in sections:
-sect_title = sect.find(attrs={'class' : re.compile('.*cmnc-((large)|(middle)|(small)).*')})
+sect_title = sect.find(attrs={'class' : re.compile(r'.*cmnc-((large)|(middle)|(small)).*')})
 if sect_title is None:
 continue
 sect_title = sect_title.contents[0]
@@ -62,7 +62,7 @@ class NRCNext(BasicNewsRecipe):
 zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
 zfile.extractall(self.output_dir)
 namelist = zfile.namelist()
-emre = re.compile('<em(?:.*)>(.*)</em>')
+emre = re.compile(r'<em(?:.*)>(.*)</em>')
 subst = '\\1'
 for name in namelist:
 _, ext = os.path.splitext(name)
@@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -48,7 +48,7 @@ class outlook(BasicNewsRecipe):
 return [('Articles', ans)]

 def preprocess_raw_html(self, raw, *a):
-m = re.search('id="__NEXT_DATA__" type="application/json">', raw)
+m = re.search(r'id="__NEXT_DATA__" type="application/json">', raw)
 raw = raw[m.start():]
 raw = raw.split('>', 1)[1]
 data = json.JSONDecoder().raw_decode(raw)[0]
@@ -41,9 +41,9 @@ class Polter(BasicNewsRecipe):
 (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html')]

 def preprocess_html(self, soup):
-for s in soup.findAll(attrs={'style': re.compile('float: ?left')}):
+for s in soup.findAll(attrs={'style': re.compile(r'float: ?left')}):
 s['class'] = 'floatleft'
-for s in soup.findAll(attrs={'style': re.compile('float: ?right')}):
+for s in soup.findAll(attrs={'style': re.compile(r'float: ?right')}):
 s['class'] = 'floatright'
 for s in soup.findAll(style=True):
 if 'bold;' in s['style']:
@@ -161,9 +161,9 @@ class PrivateEyeRecipe(BasicNewsRecipe):
 {'name': 'div', 'attrs': {'id': 'about-covers'}},
 {'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
 {'name': 'iframe'},
-{'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
-{'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
-{'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}},
+{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}},
+{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}},
+{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}},
 ]

 # Convert headers to h1, strapline to h4
@@ -54,7 +54,7 @@ class ScienceNewsIssue(BasicNewsRecipe):
 # Get articles
 soup = self.index_to_soup(url)
 soup = soup.find('main', attrs={'id':'content'})
-re_article = re.compile('https://www.sciencenews.org/article/')
+re_article = re.compile(r'https://www.sciencenews.org/article/')
 stories = []
 past_urls = set()
 for sec in soup.find_all(href=re_article):
@@ -76,8 +76,8 @@ class SolHaberRecipe(BasicNewsRecipe):
 result = []
 articles_dict = {}

-author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
-category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+author_regexp = re.compile(r'^http://.*?/yazarlar/(.*?)/.*$')
+category_regexp = re.compile(r'^http://.*?/(.+?)/.*$')

 for section_tuple in self.section_tuples:
@@ -43,7 +43,7 @@ class StandardMediaKeRecipe(BasicNewsRecipe):

 def print_version(self, url):
 import re
-p = re.compile('http://www.standardmedia.co.ke/.*InsidePage.php')
+p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php')
 return p.sub('http://www.standardmedia.co.ke/print.php', url)

 def preprocess_html(self, soup):
@@ -89,7 +89,7 @@ class TheAge(BasicNewsRecipe):

 for i in soup.findAll('a'):
 href = i['href']
-if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
+if href and re.match(r'http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
 return href

 return None
@@ -92,7 +92,7 @@ class PrivateEyeRecipe(BasicNewsRecipe):
 # 1. Title. By author
 #.2. Title by author: subtitle
 # 3. Title: author: subtitle
-title_author_re = re.compile('^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')
+title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')

 # Separate author from title (where it is specified)
 def title_author(self, head):
@@ -38,7 +38,7 @@ class Tweakers(BasicNewsRecipe):
 'class': ['sidebar', 'advertorial']
 },
 {
-'class': re.compile('nextPrevious')
+'class': re.compile(r'nextPrevious')
 },
 ]
 no_stylesheets = True
@@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -127,7 +127,7 @@ class CanWestPaper(BasicNewsRecipe):
 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
 #photocredit { font-size: xx-small; font-weight: normal; }'''

-keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
+keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]

 remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
 dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
@@ -141,7 +141,7 @@ class CanWestPaper(BasicNewsRecipe):
 name='div', attrs={'class': 'copyright'}),
 dict(name='div', attrs={'class': 'rule_grey_solid'}),
 dict(name='div', attrs={'id': 'soundoff'}),
-dict(name='div', attrs={'id': re.compile('flyer')}),
+dict(name='div', attrs={'id': re.compile(r'flyer')}),
 dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

 def get_cover_url(self):
@@ -82,28 +82,28 @@ class TimesColonist(BasicNewsRecipe):
 .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
 '''
 keep_only_tags = [
-dict(name='div', attrs={'class': re.compile('main.content')})]
+dict(name='div', attrs={'class': re.compile(r'main.content')})]

 def __init__(self, options, log, progress_reporter):
 self.remove_tags = [{'class': 'comments'},
 {'id': 'photocredit'},
 dict(name='div', attrs={
-'class': re.compile('top.controls')}),
+'class': re.compile(r'top.controls')}),
 dict(name='div', attrs={
-'class': re.compile('^comments')}),
+'class': re.compile(r'^comments')}),
 dict(name='div', attrs={
-'class': re.compile('social')}),
+'class': re.compile(r'social')}),
 dict(name='div', attrs={
-'class': re.compile('tools')}),
+'class': re.compile(r'tools')}),
 dict(name='div', attrs={
-'class': re.compile('bottom.tools')}),
+'class': re.compile(r'bottom.tools')}),
 dict(name='div', attrs={
-'class': re.compile('window')}),
-dict(name='div', attrs={'class': re.compile('related.news.element')})]
+'class': re.compile(r'window')}),
+dict(name='div', attrs={'class': re.compile(r'related.news.element')})]
 print('PROFILE NAME = ' + options.output_profile.short_name)
 if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
 self.remove_tags.append(
-dict(name='div', attrs={'class': re.compile('image-container')}))
+dict(name='div', attrs={'class': re.compile(r'image-container')}))
 BasicNewsRecipe.__init__(self, options, log, progress_reporter)

 def get_cover_url(self):
@@ -173,19 +173,19 @@ class TimesColonist(BasicNewsRecipe):
 return soup

 def preprocess_html(self, soup):
-byline = soup.find('p', attrs={'class': re.compile('ancillary')})
+byline = soup.find('p', attrs={'class': re.compile(r'ancillary')})
 if byline is not None:
 authstr = self.tag_to_string(byline, False)
-authstr = re.sub('/ *Times Colonist', '/',
+authstr = re.sub(r'/ *Times Colonist', '/',
 authstr, flags=re.IGNORECASE)
-authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
+authstr = re.sub(r'BY */', '', authstr, flags=re.IGNORECASE)
 newdiv = new_tag(soup, 'div')
 newdiv.insert(0, authstr)
 newdiv['class'] = 'byline'
 byline.replaceWith(newdiv)
-for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
+for caption in soup.findAll('p', attrs={'class': re.compile(r'caption')}):
 capstr = self.tag_to_string(caption, False)
-capstr = re.sub('Photograph by.*$', '',
+capstr = re.sub(r'Photograph by.*$', '',
 capstr, flags=re.IGNORECASE)
 newdiv = new_tag(soup, 'div')
 newdiv.insert(0, capstr)
@@ -239,13 +239,13 @@ class TimesColonist(BasicNewsRecipe):
 except:
 return ans
 mainsoup = soup.find(
-'div', attrs={'class': re.compile('main.content')})
+'div', attrs={'class': re.compile(r'main.content')})
 article_list = []
-for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
+for wdiv in mainsoup.findAll('div', attrs={'id': re.compile(r'featured.story')}):
 for htag in wdiv.findAll('h3'):
 self.handle_articles(htag, article_list, sectitle)
-for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
-for wdiv in mainsoup.findAll('div', attrs={'class': re.compile('article.row')}):
+for ladiv in mainsoup.findAll(attrs={'class': re.compile(r'leading.articles')}):
+for wdiv in mainsoup.findAll('div', attrs={'class': re.compile(r'article.row')}):
 for htag in wdiv.findAll('h2'):
 self.handle_articles(htag, article_list, sectitle)
 ans.append((sectitle, article_list))
@@ -139,7 +139,7 @@ class ZeitDe(BasicNewsRecipe):
 body.insert(0, header)

 # Add real img tags for images
-for container in soup.findAll(class_=re.compile('__media-container$')):
+for container in soup.findAll(class_=re.compile(r'__media-container$')):
 img = container.find('noscript')
 if img is not None:
 img.name = 'div'
@@ -200,11 +200,11 @@ class ZeitEPUBAbo(BasicNewsRecipe):
 # browser.follow_link(abolink)
 # find page for latest issue
 latestlink = browser.find_link(text_regex=re.compile(
-'.*ZUR AKTUELLEN AUSGABE.*'))
+r'.*ZUR AKTUELLEN AUSGABE.*'))
 browser.follow_link(latestlink)
 # now find the correct file, we will still use the ePub file
 epublink = browser.find_link(text_regex=re.compile(
-'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
+r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
 response = browser.follow_link(epublink)
 self.report_progress(1, _('next step'))
@@ -266,11 +266,11 @@ class ZeitEPUBAbo(BasicNewsRecipe):
 # browser.follow_link(abolink)
 # find page for latest issue
 latestlink = browser.find_link(text_regex=re.compile(
-'.*ZUR AKTUELLEN AUSGABE.*'))
+r'.*ZUR AKTUELLEN AUSGABE.*'))
 browser.follow_link(latestlink)
 # actual cover search
 pdflink = browser.find_link(text_regex=re.compile(
-'.*GESAMT-PDF LADEN.*'))
+r'.*GESAMT-PDF LADEN.*'))
 cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + (
 urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf')
 self.log.warning('PDF link found:')
@@ -34,6 +34,7 @@ select = [
 # preview rules
 'RUF051', 'RUF056', # useless dict operation
 'RUF055', # unnecessary regex
+'RUF039', # always use raw-string for regex
 ]

 [lint.per-file-ignores]
@@ -46,7 +47,7 @@ select = [
 "src/calibre/gui2/store/stores/*" = ['UP']
 "src/calibre/gui2/tts/manager.py" = ['UP037']
 "src/calibre/utils/copy_files.py" = ['UP037']
-"src/calibre/utils/smartypants.py" = ['RUF055']
+"src/calibre/utils/smartypants.py" = ['RUF039', 'RUF055']
 "src/qt/*.py" = ['I']
 "src/qt/*.pyi" = ['I']
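The two pyproject.toml hunks above turn the rule on: RUF039 joins the preview-rule selection, and src/calibre/utils/smartypants.py is additionally exempted from it. As a rough sketch (the exact invocation is not recorded in the commit), the auto-fix that produced the changes in this commit could be reproduced with ruff's CLI; --preview is needed because RUF039 is a preview rule:

    ruff check --preview --select RUF039 --fix .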
@@ -17,7 +17,7 @@ import time
 from contextlib import contextmanager
 from functools import lru_cache

-iswindows = re.search('win(32|64)', sys.platform)
+iswindows = re.search(r'win(32|64)', sys.platform)
 ismacos = 'darwin' in sys.platform
 isfreebsd = 'freebsd' in sys.platform
 isnetbsd = 'netbsd' in sys.platform
@@ -657,7 +657,7 @@ class Parser(SearchQueryParser): # {{{

 if location == 'template':
 try:
-template, sep, query = regex.split('#@#:([tdnb]):', query, flags=regex.IGNORECASE)
+template, sep, query = regex.split(r'#@#:([tdnb]):', query, flags=regex.IGNORECASE)
 if sep:
 sep = sep.lower()
 else:
@@ -34,7 +34,7 @@ class CYBOOK(USBMS):

 VENDOR_NAME = 'BOOKEEN'
 WINDOWS_MAIN_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-FD')
-WINDOWS_CARD_A_MEM = re.compile('CYBOOK_(OPUS|GEN3)__-SD')
+WINDOWS_CARD_A_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-SD')
 OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Cybook')

 EBOOK_DIR_MAIN = 'eBooks'
@@ -72,7 +72,7 @@ class ORIZON(CYBOOK):

 VENDOR_NAME = ['BOOKEEN', 'LINUX']
 WINDOWS_MAIN_MEM = re.compile(r'(CYBOOK_ORIZON__-FD)|(FILE-STOR_GADGET)')
-WINDOWS_CARD_A_MEM = re.compile('(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)')
+WINDOWS_CARD_A_MEM = re.compile(r'(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)')

 EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Digital Editions'
@@ -58,11 +58,11 @@ def build_template_regexp(template):

 try:
 template = template.rpartition('/')[2]
-return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)')
+return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)')
 except:
 prints('Failed to parse template: %r'%template)
 template = '{title} - {authors}'
-return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)')
+return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)')


 def create_upload_path(mdata, fname, template, sanitize,
@@ -239,7 +239,7 @@ def generate_masthead(title, output_path=None, width=600, height=60):
 def escape_xpath_attr(value):
 if '"' in value:
 if "'" in value:
-parts = re.split('("+)', value)
+parts = re.split(r'("+)', value)
 ans = []
 for x in parts:
 if x:
@@ -42,7 +42,7 @@ def _metadata_from_table(soup, searchfor):
 # on the home page. cue some nasty special-case hacks...
 if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
 meta = _detag(td.findNextSibling('td'))
-return re.sub('^:', '', meta).strip()
+return re.sub(r'^:', '', meta).strip()
 else:
 meta = _detag(td)
 return re.sub(r'^[^:]+:', '', meta).strip()
@@ -89,7 +89,7 @@ def _get_comments(soup):
 def _get_cover(soup, rdr):
 ans = None
 try:
-ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
+ans = soup.find('img', alt=re.compile(r'cover', flags=re.I))['src']
 except TypeError:
 # meeehh, no handy alt-tag goodness, try some hackery
 # the basic idea behind this is that in general, the cover image
@@ -16,7 +16,7 @@ XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+_span_pat = re.compile(r'<span.*?</span>', re.DOTALL|re.IGNORECASE)

 LIGATURES = {
 # 'Æ': 'AE',
@@ -92,7 +92,7 @@ class DocAnalysis:
 elif format == 'pdf':
 linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
 elif format == 'spanned_html':
-linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL)
 elif format == 'txt':
 linere = re.compile('.*?\n')
 self.lines = linere.findall(raw)
@@ -430,16 +430,16 @@ def book_designer_rules():
 if ans is None:
 ans = book_designer_rules.ans = [
 # HR
-(re.compile('<hr>', re.IGNORECASE),
+(re.compile(r'<hr>', re.IGNORECASE),
 lambda match : '<span style="page-break-after:always"> </span>'),
 # Create header tags
 (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
 lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
 (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
 lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+(re.compile(r'<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
 lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+(re.compile(r'<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
 lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
 ]
 return ans
@@ -458,7 +458,7 @@ class HTMLPreProcessor:
 re.IGNORECASE).search(src) is not None

 def is_book_designer(self, raw):
-return re.search('<H2[^><]*id=BookTitle', raw) is not None
+return re.search(r'<H2[^><]*id=BookTitle', raw) is not None

 def is_pdftohtml(self, src):
 return "<!-- created by calibre's pdftohtml -->" in src[:1000]
@@ -27,7 +27,7 @@ class HeuristicProcessor:
 self.chapters_with_title = 0
 self.blanks_deleted = False
 self.blanks_between_paragraphs = False
-self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+self.linereg = re.compile(r'(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
 self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
 self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
 self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
@@ -108,7 +108,7 @@ class HeuristicProcessor:
 inspect. Percent is the minimum percent of line endings which should
 be marked up to return true.
 '''
-htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
+htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL)
 line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
 htm_end = htm_end_ere.findall(raw)
 line_end = line_end_ere.findall(raw)
@@ -209,7 +209,7 @@ class HeuristicProcessor:
 typical_chapters = 15000.
 self.min_chapters = int(ceil(wordcount / typical_chapters))
 self.log.debug('minimum chapters required are: '+str(self.min_chapters))
-heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+heading = re.compile(r'<h[1-3][^>]*>', re.IGNORECASE)
 self.html_preprocess_sections = len(heading.findall(html))
 self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings')
@@ -299,7 +299,7 @@ class HeuristicProcessor:
 break
 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
 if n_lookahead_req:
-n_lookahead = re.sub('(ou|in|cha)', 'lookahead_', full_chapter_line)
+n_lookahead = re.sub(r'(ou|in|cha)', 'lookahead_', full_chapter_line)
 if not analyze:
 self.log.debug('Marked ' + str(self.html_preprocess_sections) + ' headings, ' + log_message)
@@ -442,7 +442,7 @@ class HeuristicProcessor:
 # Delete microsoft 'smart' tags
 html = re.sub('(?i)</?st1:\\w+>', '', html)
 # Re-open self closing paragraph tags
-html = re.sub('<p[^>/]*/>', '<p> </p>', html)
+html = re.sub(r'<p[^>/]*/>', '<p> </p>', html)
 # Get rid of empty span, bold, font, em, & italics tags
 fmt_tags = 'font|[ibu]|em|strong'
 open_fmt_pat, close_fmt_pat = fr'<(?:{fmt_tags})(?:\s[^>]*)?>', f'</(?:{fmt_tags})>'
@@ -462,8 +462,8 @@ class HeuristicProcessor:
 determines the type of html line ending used most commonly in a document
 use before calling docanalysis functions
 '''
-paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+paras_reg = re.compile(r'<p[^>]*>', re.IGNORECASE)
+spans_reg = re.compile(r'<span[^>]*>', re.IGNORECASE)
 paras = len(paras_reg.findall(html))
 spans = len(spans_reg.findall(html))
 if spans > 1:
@@ -557,8 +557,8 @@ class HeuristicProcessor:

 def detect_soft_breaks(self, html):
 line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
-line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
-'\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
+line_two = '(?P<line_two>'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \
+'\\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')'
 div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
 div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
@@ -596,8 +596,8 @@ class HeuristicProcessor:
 All other html is converted to text.
 '''
 hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
-if re.findall('(<|>)', replacement_break):
-if re.match('^<hr', replacement_break):
+if re.findall(r'(<|>)', replacement_break):
+if re.match(r'^<hr', replacement_break):
 if replacement_break.find('width') != -1:
 try:
 width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
@@ -608,11 +608,11 @@ class HeuristicProcessor:
 else:
 replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
 divpercent = (100 - width) // 2
-hr_open = re.sub('45', str(divpercent), hr_open)
+hr_open = re.sub(r'45', str(divpercent), hr_open)
 scene_break = hr_open+replacement_break+'</div>'
 else:
 scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
-elif re.match('^<img', replacement_break):
+elif re.match(r'^<img', replacement_break):
 scene_break = self.scene_break_open+replacement_break+'</p>'
 else:
 from calibre.utils.html2text import html2text
@@ -638,7 +638,7 @@ class HeuristicProcessor:
 empty_paragraph = '\n<p> </p>\n'
 self.in_blockquote = False
 self.previous_was_paragraph = False
-html = re.sub('</?a[^>]*>', '', html)
+html = re.sub(r'</?a[^>]*>', '', html)

 def convert_styles(match):
 # print('raw styles are: '+match.group('styles'))
@@ -91,7 +91,7 @@ class HTMLFile:

 HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
 HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE)
-TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+TITLE_PAT = re.compile(r'<title>([^<>]+)</title>', re.IGNORECASE)
 LINK_PAT = re.compile(
 r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
 re.DOTALL|re.IGNORECASE)
@@ -269,7 +269,7 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
 tag = 'div'
 # Add page-break-brefore: always because renders typically treat a new file (we're merging files)
 # as a page break and remove all other page break types that might be set.
-style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
+style_a = 'page-break-before: always; %s' % re.sub(r'page-break-[^:]+:[^;]+;?', '', style_a)
 # Remove unnecessary spaces.
 style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
 tags.append(tag)
@@ -34,8 +34,8 @@ class Hyphenator:
 def _insert_pattern(self, pattern):
 # Convert a pattern like 'a1bc3d4' into a string of chars 'abcd'
 # and a list of points [ 1, 0, 3, 4 ].
-chars = re.sub('[0-9]', '', pattern)
-points = [int(d or 0) for d in re.split('[.a-z]', pattern)]
+chars = re.sub(r'[0-9]', '', pattern)
+points = [int(d or 0) for d in re.split(r'[.a-z]', pattern)]

 # Insert the pattern into the tree. Each character finds a dict
 # another level down in the tree, and leaf nodes have the list of
@@ -163,7 +163,7 @@ class HTMLConverter:
 # Fix Book Designer markup
 BOOK_DESIGNER = [
 # HR
-(re.compile('<hr>', re.IGNORECASE),
+(re.compile(r'<hr>', re.IGNORECASE),
 lambda match : '<span style="page-break-after:always"> </span>'),
 # Create header tags
 (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
@@ -279,7 +279,7 @@ class HTMLConverter:
 if isinstance(src, bytes):
 src = src.decode('utf-8', 'replace')
 match = self.PAGE_BREAK_PAT.search(src)
-if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+if match and not re.match(r'avoid', match.group(1), re.IGNORECASE):
 self.page_break_found = True
 ncss, npcss = self.parse_css(src)
 if ncss:
@@ -324,10 +324,10 @@ class HTMLConverter:

 def is_baen(self, soup):
 return bool(soup.find('meta', attrs={'name':'Publisher',
-'content':re.compile('Baen', re.IGNORECASE)}))
+'content':re.compile(r'Baen', re.IGNORECASE)}))

 def is_book_designer(self, raw):
-return bool(re.search('<H2[^><]*id=BookTitle', raw))
+return bool(re.search(r'<H2[^><]*id=BookTitle', raw))

 def preprocess(self, raw):
 nmassage = []
@@ -1152,7 +1152,7 @@ class HTMLConverter:

 def font_weight(val):
 ans = 0
-m = re.search('([0-9]+)', val)
+m = re.search(r'([0-9]+)', val)
 if m:
 ans = int(m.group(1))
 elif val.find('bold') >= 0 or val.find('strong') >= 0:
@@ -1544,7 +1544,7 @@ class HTMLConverter:
 with open(path, 'rb') as f:
 src = f.read().decode('utf-8', 'replace')
 match = self.PAGE_BREAK_PAT.search(src)
-if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+if match and not re.match(r'avoid', match.group(1), re.IGNORECASE):
 self.page_break_found = True
 ncss, npcss = self.parse_css(src)
 except OSError:
@@ -1869,11 +1869,11 @@ def process_file(path, options, logger):
 header.append(fheader + ' ')
 book, fonts = Book(options, logger, header=header, **args)
 le = re.compile(options.link_exclude) if options.link_exclude else \
-re.compile('$')
+re.compile(r'$')
 pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
-re.compile('$')
+re.compile(r'$')
 fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
-re.compile('$')
+re.compile(r'$')
 cq = options.chapter_attr.split(',')
 if len(cq) < 3:
 raise ValueError('The --chapter-attr setting must have 2 commas.')
@@ -213,7 +213,7 @@ class Row:
 def __init__(self, conv, row, css, colpad):
 self.cells = []
 self.colpad = colpad
-cells = row.findAll(re.compile('td|th', re.IGNORECASE))
+cells = row.findAll(re.compile(r'td|th', re.IGNORECASE))
 self.targets = []
 for cell in cells:
 ccss = conv.tag_css(cell, css)[0]
@@ -172,7 +172,7 @@ def get_title_sort_pat(lang=None):
 except:
 ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
 else:
-ans = re.compile('^$')  # matches only the empty string
+ans = re.compile(r'^$')  # matches only the empty string
 _title_pats[lang] = ans
 return ans
@@ -139,7 +139,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None):
 try:
 pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)
 except Exception:
-pat = regex.compile('(?P<title>.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)
+pat = regex.compile(r'(?P<title>.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)

 name = name.replace('_', ' ')
 match = pat.search(name)
@@ -59,4 +59,4 @@ def set_metadata(stream, mi):
 MetadataWriter(stream, mi)

 stream.seek(0)
-stream.write(re.sub('[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00')
+stream.write(re.sub(r'[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00')
@@ -365,7 +365,7 @@ class Worker(Thread): # Get details {{{
 r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '
 r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'
 )
-self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')
+self.ratings_pat_cn = re.compile(r'([0-9.]+) 颗星,最多 5 颗星')
 self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)')

 lm = {
@@ -165,7 +165,7 @@ def wayback_url_processor(url):
 if url.startswith('/'):
 # Use original URL instead of absolutizing to wayback URL as wayback is
 # slow
-m = re.search('https?:', url)
+m = re.search(r'https?:', url)
 if m is None:
 url = 'https://web.archive.org' + url
 else:
@@ -380,7 +380,7 @@ class MobiReader:
 self.processed_html = re.sub(
 r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
 bods = htmls = 0
-for x in re.finditer('</body>|</html>', self.processed_html):
+for x in re.finditer(r'</body>|</html>', self.processed_html):
 if x == '</body>':
 bods +=1
 else:
@@ -155,7 +155,7 @@ def hfix(name, raw):
 return raw


-CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in iteritems(HELP)}
+CLI_HELP = {x:hfix(x, re.sub(r'<.*?>', '', y)) for x, y in iteritems(HELP)}
 # }}}
@@ -36,7 +36,7 @@ class Patterns:
 # French words with prefixes are reduced to the stem word, so that the
 # words appear only once in the word list
 self.fr_elision_pat = regex.compile(
-"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)
+r"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)


 def patterns():
@ -102,7 +102,7 @@ class SVGRasterizer:

if view_box is not None:
try:
box = [float(x) for x in filter(None, re.split('[, ]', view_box))]
box = [float(x) for x in filter(None, re.split(r'[, ]', view_box))]
sizes = [box[2]-box[0], box[3] - box[1]]
except (TypeError, ValueError, IndexError):
logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box)

@ -152,7 +152,7 @@ def flip_image(img, flip):


def flip_images(raw):
for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
for match in re.finditer(r'<IMG[^>]+/?>', raw, flags=re.I):
img = match.group()
m = re.search(r'class="(x|y|xy)flip"', img)
if m is None:

@ -174,5 +174,5 @@ def flip_images(raw):
counter += 1
return m.group(1).rstrip('/') + f' alt="Image {counter}"/>'

raw = re.sub('(<IMG[^>]+)/?>', add_alt, raw, flags=re.I)
raw = re.sub(r'(<IMG[^>]+)/?>', add_alt, raw, flags=re.I)
return raw

@ -121,7 +121,7 @@ class Font:
self.metrics, self.compress = metrics, compress
self.is_otf = self.metrics.is_otf
self.subset_tag = str(
re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
re.sub(r'.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
)).rjust(6, 'A')
self.font_stream = FontStream(metrics.is_otf, compress=compress)
try:

@ -199,11 +199,11 @@ class PMLMLizer:
text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

# Remove excess spaces at beginning and end of lines
text = re.sub('(?m)^[ ]+', '', text)
text = re.sub('(?m)[ ]+$', '', text)
text = re.sub(r'(?m)^[ ]+', '', text)
text = re.sub(r'(?m)[ ]+$', '', text)

# Remove excessive spaces
text = re.sub('[ ]{2,}', ' ', text)
text = re.sub(r'[ ]{2,}', ' ', text)

# Condense excessive \c empty line sequences.
text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)

@ -213,7 +213,7 @@ class PMLMLizer:
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
# Only indent lines that don't have special formatting
text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
else:
text = re.sub('\n{3,}', '\n\n', text)

@ -19,11 +19,11 @@ def tounicode(tree_or_node, **kwargs):


REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I), # noqa: E501
'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
'unlikelyCandidatesRe': re.compile(r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I), # noqa: E501
'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main|shadow',re.I),
'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501
'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
# 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
# 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
# 'trimRe': re.compile('^\s+|\s+$/'),

@ -121,7 +121,7 @@ class RTFMLizer:
self.log.debug('Converting %s to RTF markup...' % item.href)
# Removing comments is needed as comments with -- inside them can
# cause fromstring() to fail
content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
content = re.sub(r'<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
content = self.remove_newlines(content)
content = self.remove_tabs(content)
content = safe_xml_fromstring(content)

@ -198,7 +198,7 @@ class RTFMLizer:
text = re.sub('%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text)

# Remove excessive spaces
text = re.sub('[ ]{2,}', ' ', text)
text = re.sub(r'[ ]{2,}', ' ', text)
text = re.sub('\t{2,}', '\t', text)
text = text.replace('\t ', '\t')

@ -652,7 +652,7 @@ class ProcessTokens:
return f'cw<{pre}<{token}<nu<{type}\n'

def __language_func(self, pre, token, num):
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
lang_name = self.__language_dict.get(int(re.search(r'[0-9]+', num).group()))
if not lang_name:
lang_name = 'not defined'
if self.__run_level > 3:

@ -165,13 +165,13 @@ class SNBMLizer:
text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
text = re.sub('(?imu)^(?=.)', '\t', text)
text = re.sub(r'(?imu)^(?=.)', '\t', text)
else:
text = re.sub('\n{3,}', '\n\n', text)

# Replace spaces at the beginning and end of lines
text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text)
text = re.sub(r'(?imu)^[ ]+', '', text)
text = re.sub(r'(?imu)[ ]+$', '', text)

if self.opts.snb_max_line_length:
max_length = self.opts.snb_max_line_length

@ -6,117 +6,117 @@ import re


def unsmarten(txt):
txt = re.sub('¢|&cent;|&#162;', r'{c\}', txt) # cent
txt = re.sub('£|&pound;|&#163;', r'{L-}', txt) # pound
txt = re.sub('¥|&yen;|&#165;', r'{Y=}', txt) # yen
txt = re.sub('©|&copy;|&#169;', r'{(c)}', txt) # copyright
txt = re.sub('®|&reg;|&#174;', r'{(r)}', txt) # registered
txt = re.sub('¼|&frac14;|&#188;', r'{1/4}', txt) # quarter
txt = re.sub('½|&frac12;|&#189;', r'{1/2}', txt) # half
txt = re.sub('¾|&frac34;|&#190;', r'{3/4}', txt) # three-quarter
txt = re.sub('À|&Agrave;|&#192;', r'{A`)}', txt) # A-grave
txt = re.sub('Á|&Aacute;|&#193;', r"{A'}", txt) # A-acute
txt = re.sub('Â|&Acirc;|&#194;', r'{A^}', txt) # A-circumflex
txt = re.sub('Ã|&Atilde;|&#195;', r'{A~}', txt) # A-tilde
txt = re.sub('Ä|&Auml;|&#196;', r'{A"}', txt) # A-umlaut
txt = re.sub('Å|&Aring;|&#197;', r'{Ao}', txt) # A-ring
txt = re.sub('Æ|&AElig;|&#198;', r'{AE}', txt) # AE
txt = re.sub('Ç|&Ccedil;|&#199;', r'{C,}', txt) # C-cedilla
txt = re.sub('È|&Egrave;|&#200;', r'{E`}', txt) # E-grave
txt = re.sub('É|&Eacute;|&#201;', r"{E'}", txt) # E-acute
txt = re.sub('Ê|&Ecirc;|&#202;', r'{E^}', txt) # E-circumflex
txt = re.sub('Ë|&Euml;|&#203;', r'{E"}', txt) # E-umlaut
txt = re.sub('Ì|&Igrave;|&#204;', r'{I`}', txt) # I-grave
txt = re.sub('Í|&Iacute;|&#205;', r"{I'}", txt) # I-acute
txt = re.sub('Î|&Icirc;|&#206;', r'{I^}', txt) # I-circumflex
txt = re.sub('Ï|&Iuml;|&#207;', r'{I"}', txt) # I-umlaut
txt = re.sub('Ð|&ETH;|&#208;', r'{D-}', txt) # ETH
txt = re.sub('Ñ|&Ntilde;|&#209;', r'{N~}', txt) # N-tilde
txt = re.sub('Ò|&Ograve;|&#210;', r'{O`}', txt) # O-grave
txt = re.sub('Ó|&Oacute;|&#211;', r"{O'}", txt) # O-acute
txt = re.sub('Ô|&Ocirc;|&#212;', r'{O^}', txt) # O-circumflex
txt = re.sub('Õ|&Otilde;|&#213;', r'{O~}', txt) # O-tilde
txt = re.sub('Ö|&Ouml;|&#214;', r'{O"}', txt) # O-umlaut
txt = re.sub('×|&times;|&#215;', r'{x}', txt) # dimension
txt = re.sub('Ø|&Oslash;|&#216;', r'{O/}', txt) # O-slash
txt = re.sub('Ù|&Ugrave;|&#217;', r'{U`}', txt) # U-grave
txt = re.sub('Ú|&Uacute;|&#218;', r"{U'}", txt) # U-acute
txt = re.sub('Û|&Ucirc;|&#219;', r'{U^}', txt) # U-circumflex
txt = re.sub('Ü|&Uuml;|&#220;', r'{U"}', txt) # U-umlaut
txt = re.sub('Ý|&Yacute;|&#221;', r"{Y'}", txt) # Y-grave
txt = re.sub('ß|&szlig;|&#223;', r'{sz}', txt) # sharp-s
txt = re.sub('à|&agrave;|&#224;', r'{a`}', txt) # a-grave
txt = re.sub('á|&aacute;|&#225;', r"{a'}", txt) # a-acute
txt = re.sub('â|&acirc;|&#226;', r'{a^}', txt) # a-circumflex
txt = re.sub('ã|&atilde;|&#227;', r'{a~}', txt) # a-tilde
txt = re.sub('ä|&auml;|&#228;', r'{a"}', txt) # a-umlaut
txt = re.sub('å|&aring;|&#229;', r'{ao}', txt) # a-ring
txt = re.sub('æ|&aelig;|&#230;', r'{ae}', txt) # ae
txt = re.sub('ç|&ccedil;|&#231;', r'{c,}', txt) # c-cedilla
txt = re.sub('è|&egrave;|&#232;', r'{e`}', txt) # e-grave
txt = re.sub('é|&eacute;|&#233;', r"{e'}", txt) # e-acute
txt = re.sub('ê|&ecirc;|&#234;', r'{e^}', txt) # e-circumflex
txt = re.sub('ë|&euml;|&#235;', r'{e"}', txt) # e-umlaut
txt = re.sub('ì|&igrave;|&#236;', r'{i`}', txt) # i-grave
txt = re.sub('í|&iacute;|&#237;', r"{i'}", txt) # i-acute
txt = re.sub('î|&icirc;|&#238;', r'{i^}', txt) # i-circumflex
txt = re.sub('ï|&iuml;|&#239;', r'{i"}', txt) # i-umlaut
txt = re.sub('ð|&eth;|&#240;', r'{d-}', txt) # eth
txt = re.sub('ñ|&ntilde;|&#241;', r'{n~}', txt) # n-tilde
txt = re.sub('ò|&ograve;|&#242;', r'{o`}', txt) # o-grave
txt = re.sub('ó|&oacute;|&#243;', r"{o'}", txt) # o-acute
txt = re.sub('ô|&ocirc;|&#244;', r'{o^}', txt) # o-circumflex
txt = re.sub('õ|&otilde;|&#245;', r'{o~}', txt) # o-tilde
txt = re.sub('ö|&ouml;|&#246;', r'{o"}', txt) # o-umlaut
txt = re.sub('ø|&oslash;|&#248;', r'{o/}', txt) # o-stroke
txt = re.sub('ù|&ugrave;|&#249;', r'{u`}', txt) # u-grave
txt = re.sub('ú|&uacute;|&#250;', r"{u'}", txt) # u-acute
txt = re.sub('û|&ucirc;|&#251;', r'{u^}', txt) # u-circumflex
txt = re.sub('ü|&uuml;|&#252;', r'{u"}', txt) # u-umlaut
txt = re.sub('ý|&yacute;|&#253;', r"{y'}", txt) # y-acute
txt = re.sub('ÿ|&yuml;|&#255;', r'{y"}', txt) # y-umlaut
txt = re.sub(r'¢|&cent;|&#162;', r'{c\}', txt) # cent
txt = re.sub(r'£|&pound;|&#163;', r'{L-}', txt) # pound
txt = re.sub(r'¥|&yen;|&#165;', r'{Y=}', txt) # yen
txt = re.sub(r'©|&copy;|&#169;', r'{(c)}', txt) # copyright
txt = re.sub(r'®|&reg;|&#174;', r'{(r)}', txt) # registered
txt = re.sub(r'¼|&frac14;|&#188;', r'{1/4}', txt) # quarter
txt = re.sub(r'½|&frac12;|&#189;', r'{1/2}', txt) # half
txt = re.sub(r'¾|&frac34;|&#190;', r'{3/4}', txt) # three-quarter
txt = re.sub(r'À|&Agrave;|&#192;', r'{A`)}', txt) # A-grave
txt = re.sub(r'Á|&Aacute;|&#193;', r"{A'}", txt) # A-acute
txt = re.sub(r'Â|&Acirc;|&#194;', r'{A^}', txt) # A-circumflex
txt = re.sub(r'Ã|&Atilde;|&#195;', r'{A~}', txt) # A-tilde
txt = re.sub(r'Ä|&Auml;|&#196;', r'{A"}', txt) # A-umlaut
txt = re.sub(r'Å|&Aring;|&#197;', r'{Ao}', txt) # A-ring
txt = re.sub(r'Æ|&AElig;|&#198;', r'{AE}', txt) # AE
txt = re.sub(r'Ç|&Ccedil;|&#199;', r'{C,}', txt) # C-cedilla
txt = re.sub(r'È|&Egrave;|&#200;', r'{E`}', txt) # E-grave
txt = re.sub(r'É|&Eacute;|&#201;', r"{E'}", txt) # E-acute
txt = re.sub(r'Ê|&Ecirc;|&#202;', r'{E^}', txt) # E-circumflex
txt = re.sub(r'Ë|&Euml;|&#203;', r'{E"}', txt) # E-umlaut
txt = re.sub(r'Ì|&Igrave;|&#204;', r'{I`}', txt) # I-grave
txt = re.sub(r'Í|&Iacute;|&#205;', r"{I'}", txt) # I-acute
txt = re.sub(r'Î|&Icirc;|&#206;', r'{I^}', txt) # I-circumflex
txt = re.sub(r'Ï|&Iuml;|&#207;', r'{I"}', txt) # I-umlaut
txt = re.sub(r'Ð|&ETH;|&#208;', r'{D-}', txt) # ETH
txt = re.sub(r'Ñ|&Ntilde;|&#209;', r'{N~}', txt) # N-tilde
txt = re.sub(r'Ò|&Ograve;|&#210;', r'{O`}', txt) # O-grave
txt = re.sub(r'Ó|&Oacute;|&#211;', r"{O'}", txt) # O-acute
txt = re.sub(r'Ô|&Ocirc;|&#212;', r'{O^}', txt) # O-circumflex
txt = re.sub(r'Õ|&Otilde;|&#213;', r'{O~}', txt) # O-tilde
txt = re.sub(r'Ö|&Ouml;|&#214;', r'{O"}', txt) # O-umlaut
txt = re.sub(r'×|&times;|&#215;', r'{x}', txt) # dimension
txt = re.sub(r'Ø|&Oslash;|&#216;', r'{O/}', txt) # O-slash
txt = re.sub(r'Ù|&Ugrave;|&#217;', r'{U`}', txt) # U-grave
txt = re.sub(r'Ú|&Uacute;|&#218;', r"{U'}", txt) # U-acute
txt = re.sub(r'Û|&Ucirc;|&#219;', r'{U^}', txt) # U-circumflex
txt = re.sub(r'Ü|&Uuml;|&#220;', r'{U"}', txt) # U-umlaut
txt = re.sub(r'Ý|&Yacute;|&#221;', r"{Y'}", txt) # Y-grave
txt = re.sub(r'ß|&szlig;|&#223;', r'{sz}', txt) # sharp-s
txt = re.sub(r'à|&agrave;|&#224;', r'{a`}', txt) # a-grave
txt = re.sub(r'á|&aacute;|&#225;', r"{a'}", txt) # a-acute
txt = re.sub(r'â|&acirc;|&#226;', r'{a^}', txt) # a-circumflex
txt = re.sub(r'ã|&atilde;|&#227;', r'{a~}', txt) # a-tilde
txt = re.sub(r'ä|&auml;|&#228;', r'{a"}', txt) # a-umlaut
txt = re.sub(r'å|&aring;|&#229;', r'{ao}', txt) # a-ring
txt = re.sub(r'æ|&aelig;|&#230;', r'{ae}', txt) # ae
txt = re.sub(r'ç|&ccedil;|&#231;', r'{c,}', txt) # c-cedilla
txt = re.sub(r'è|&egrave;|&#232;', r'{e`}', txt) # e-grave
txt = re.sub(r'é|&eacute;|&#233;', r"{e'}", txt) # e-acute
txt = re.sub(r'ê|&ecirc;|&#234;', r'{e^}', txt) # e-circumflex
txt = re.sub(r'ë|&euml;|&#235;', r'{e"}', txt) # e-umlaut
txt = re.sub(r'ì|&igrave;|&#236;', r'{i`}', txt) # i-grave
txt = re.sub(r'í|&iacute;|&#237;', r"{i'}", txt) # i-acute
txt = re.sub(r'î|&icirc;|&#238;', r'{i^}', txt) # i-circumflex
txt = re.sub(r'ï|&iuml;|&#239;', r'{i"}', txt) # i-umlaut
txt = re.sub(r'ð|&eth;|&#240;', r'{d-}', txt) # eth
txt = re.sub(r'ñ|&ntilde;|&#241;', r'{n~}', txt) # n-tilde
txt = re.sub(r'ò|&ograve;|&#242;', r'{o`}', txt) # o-grave
txt = re.sub(r'ó|&oacute;|&#243;', r"{o'}", txt) # o-acute
txt = re.sub(r'ô|&ocirc;|&#244;', r'{o^}', txt) # o-circumflex
txt = re.sub(r'õ|&otilde;|&#245;', r'{o~}', txt) # o-tilde
txt = re.sub(r'ö|&ouml;|&#246;', r'{o"}', txt) # o-umlaut
txt = re.sub(r'ø|&oslash;|&#248;', r'{o/}', txt) # o-stroke
txt = re.sub(r'ù|&ugrave;|&#249;', r'{u`}', txt) # u-grave
txt = re.sub(r'ú|&uacute;|&#250;', r"{u'}", txt) # u-acute
txt = re.sub(r'û|&ucirc;|&#251;', r'{u^}', txt) # u-circumflex
txt = re.sub(r'ü|&uuml;|&#252;', r'{u"}', txt) # u-umlaut
txt = re.sub(r'ý|&yacute;|&#253;', r"{y'}", txt) # y-acute
txt = re.sub(r'ÿ|&yuml;|&#255;', r'{y"}', txt) # y-umlaut

txt = re.sub('Č|&Ccaron;|&#268;', r'{Cˇ}', txt) # C-caron
txt = re.sub('č|&ccaron;|&#269;', r'{cˇ}', txt) # c-caron
txt = re.sub('Ď|&Dcaron;|&#270;', r'{Dˇ}', txt) # D-caron
txt = re.sub('ď|&dcaron;|&#271;', r'{dˇ}', txt) # d-caron
txt = re.sub('Ě|&Ecaron;|&#282;', r'{Eˇ}', txt) # E-caron
txt = re.sub('ě|&ecaron;|&#283;', r'{eˇ}', txt) # e-caron
txt = re.sub('Ĺ|&Lacute;|&#313;', r"{L'}", txt) # L-acute
txt = re.sub('ĺ|&lacute;|&#314;', r"{l'}", txt) # l-acute
txt = re.sub('Ľ|&Lcaron;|&#317;', r'{Lˇ}', txt) # L-caron
txt = re.sub('ľ|&lcaron;|&#318;', r'{lˇ}', txt) # l-caron
txt = re.sub('Ň|&Ncaron;|&#327;', r'{Nˇ}', txt) # N-caron
txt = re.sub('ň|&ncaron;|&#328;', r'{nˇ}', txt) # n-caron
txt = re.sub(r'Č|&Ccaron;|&#268;', r'{Cˇ}', txt) # C-caron
txt = re.sub(r'č|&ccaron;|&#269;', r'{cˇ}', txt) # c-caron
txt = re.sub(r'Ď|&Dcaron;|&#270;', r'{Dˇ}', txt) # D-caron
txt = re.sub(r'ď|&dcaron;|&#271;', r'{dˇ}', txt) # d-caron
txt = re.sub(r'Ě|&Ecaron;|&#282;', r'{Eˇ}', txt) # E-caron
txt = re.sub(r'ě|&ecaron;|&#283;', r'{eˇ}', txt) # e-caron
txt = re.sub(r'Ĺ|&Lacute;|&#313;', r"{L'}", txt) # L-acute
txt = re.sub(r'ĺ|&lacute;|&#314;', r"{l'}", txt) # l-acute
txt = re.sub(r'Ľ|&Lcaron;|&#317;', r'{Lˇ}', txt) # L-caron
txt = re.sub(r'ľ|&lcaron;|&#318;', r'{lˇ}', txt) # l-caron
txt = re.sub(r'Ň|&Ncaron;|&#327;', r'{Nˇ}', txt) # N-caron
txt = re.sub(r'ň|&ncaron;|&#328;', r'{nˇ}', txt) # n-caron

txt = re.sub('Œ|&OElig;|&#338;', r'{OE}', txt) # OE
txt = re.sub('œ|&oelig;|&#339;', r'{oe}', txt) # oe
txt = re.sub(r'Œ|&OElig;|&#338;', r'{OE}', txt) # OE
txt = re.sub(r'œ|&oelig;|&#339;', r'{oe}', txt) # oe

txt = re.sub('Ŕ|&Racute;|&#340;', r"{R'}", txt) # R-acute
txt = re.sub('ŕ|&racute;|&#341;', r"{r'}", txt) # r-acute
txt = re.sub('Ř|&Rcaron;|&#344;', r'{Rˇ}', txt) # R-caron
txt = re.sub('ř|&rcaron;|&#345;', r'{rˇ}', txt) # r-caron
txt = re.sub('Ŝ|&#348;', r'{S^}', txt) # S-circumflex
txt = re.sub('ŝ|&#349;', r'{s^}', txt) # s-circumflex
txt = re.sub('Š|&Scaron;|&#352;', r'{Sˇ}', txt) # S-caron
txt = re.sub('š|&scaron;|&#353;', r'{sˇ}', txt) # s-caron
txt = re.sub('Ť|&Tcaron;|&#356;', r'{Tˇ}', txt) # T-caron
txt = re.sub('ť|&tcaron;|&#357;', r'{tˇ}', txt) # t-caron
txt = re.sub('Ů|&Uring;|&#366;', r'{U°}', txt) # U-ring
txt = re.sub('ů|&uring;|&#367;', r'{u°}', txt) # u-ring
txt = re.sub('Ž|&Zcaron;|&#381;', r'{Zˇ}', txt) # Z-caron
txt = re.sub('ž|&zcaron;|&#382;', r'{zˇ}', txt) # z-caron
txt = re.sub(r'Ŕ|&Racute;|&#340;', r"{R'}", txt) # R-acute
txt = re.sub(r'ŕ|&racute;|&#341;', r"{r'}", txt) # r-acute
txt = re.sub(r'Ř|&Rcaron;|&#344;', r'{Rˇ}', txt) # R-caron
txt = re.sub(r'ř|&rcaron;|&#345;', r'{rˇ}', txt) # r-caron
txt = re.sub(r'Ŝ|&#348;', r'{S^}', txt) # S-circumflex
txt = re.sub(r'ŝ|&#349;', r'{s^}', txt) # s-circumflex
txt = re.sub(r'Š|&Scaron;|&#352;', r'{Sˇ}', txt) # S-caron
txt = re.sub(r'š|&scaron;|&#353;', r'{sˇ}', txt) # s-caron
txt = re.sub(r'Ť|&Tcaron;|&#356;', r'{Tˇ}', txt) # T-caron
txt = re.sub(r'ť|&tcaron;|&#357;', r'{tˇ}', txt) # t-caron
txt = re.sub(r'Ů|&Uring;|&#366;', r'{U°}', txt) # U-ring
txt = re.sub(r'ů|&uring;|&#367;', r'{u°}', txt) # u-ring
txt = re.sub(r'Ž|&Zcaron;|&#381;', r'{Zˇ}', txt) # Z-caron
txt = re.sub(r'ž|&zcaron;|&#382;', r'{zˇ}', txt) # z-caron

txt = re.sub('•|&bull;|&#8226;', r'{*}', txt) # bullet
txt = re.sub('₣|&#8355;', r'{Fr}', txt) # Franc
txt = re.sub('₤|&#8356;', r'{L=}', txt) # Lira
txt = re.sub('₨|&#8360;', r'{Rs}', txt) # Rupee
txt = re.sub('€|&euro;|&#8364;', r'{C=}', txt) # euro
txt = re.sub('™|&trade;|&#8482;', r'{tm}', txt) # trademark
txt = re.sub('♠|&spades;|&#9824;', r'{spade}', txt) # spade
txt = re.sub('♣|&clubs;|&#9827;', r'{club}', txt) # club
txt = re.sub('♥|&hearts;|&#9829;', r'{heart}', txt) # heart
txt = re.sub('♦|&diams;|&#9830;', r'{diamond}', txt) # diamond
txt = re.sub(r'•|&bull;|&#8226;', r'{*}', txt) # bullet
txt = re.sub(r'₣|&#8355;', r'{Fr}', txt) # Franc
txt = re.sub(r'₤|&#8356;', r'{L=}', txt) # Lira
txt = re.sub(r'₨|&#8360;', r'{Rs}', txt) # Rupee
txt = re.sub(r'€|&euro;|&#8364;', r'{C=}', txt) # euro
txt = re.sub(r'™|&trade;|&#8482;', r'{tm}', txt) # trademark
txt = re.sub(r'♠|&spades;|&#9824;', r'{spade}', txt) # spade
txt = re.sub(r'♣|&clubs;|&#9827;', r'{club}', txt) # club
txt = re.sub(r'♥|&hearts;|&#9829;', r'{heart}', txt) # heart
txt = re.sub(r'♦|&diams;|&#9830;', r'{diamond}', txt) # diamond

# Move into main code?
# txt = re.sub('\xa0', r'p. ', txt) # blank paragraph

@ -51,9 +51,9 @@ class MarkdownMLizer(OEB2HTML):

def tidy_up(self, text):
# Remove blank space form beginning of paragraph.
text = re.sub('(?msu)^[ ]{1,3}', '', text)
text = re.sub(r'(?msu)^[ ]{1,3}', '', text)
# pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
text = re.sub('(?msu)^[ ]', '    ', text)
text = re.sub(r'(?msu)^[ ]', '    ', text)

# Remove tabs that aren't at the beginning of a line
new_text = []

@ -68,7 +68,7 @@ class MarkdownMLizer(OEB2HTML):
text = '\n'.join(new_text)

# Remove spaces from blank lines.
text = re.sub('(?msu)^[ ]+$', '', text)
text = re.sub(r'(?msu)^[ ]+$', '', text)

# Reduce blank lines
text = re.sub('(?msu)\n{7,}', '\n' * 6, text)

@ -34,7 +34,7 @@ def clean_txt(txt):
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)

# Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt)
txt = re.sub(r'[ ]{2,}', ' ', txt)

# Remove blank space from the beginning and end of the document.
txt = re.sub(r'^\s+(?=.)', '', txt)

@ -213,7 +213,7 @@ def preserve_spaces(txt):
'''
Replaces spaces multiple spaces with entities.
'''
txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
txt = re.sub(r'(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
return txt

@ -325,9 +325,9 @@ def detect_formatting_type(txt):

# Check for markdown
# Headings
markdown_count += len(re.findall('(?mu)^#+', txt))
markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt))
markdown_count += len(re.findall(r'(?mu)^#+', txt))
markdown_count += len(re.findall(r'(?mu)^=+$', txt))
markdown_count += len(re.findall(r'(?mu)^-+$', txt))
# Images
markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt))
# Links

@ -126,7 +126,7 @@ class TXTMLizer:
text = re.sub('(?<=.)\n(?=.)', ' ', text)

# Remove multiple spaces.
text = re.sub('[ ]{2,}', ' ', text)
text = re.sub(r'[ ]{2,}', ' ', text)

# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)

@ -140,8 +140,8 @@ class TXTMLizer:
# Replace spaces at the beginning and end of lines
# We don't replace tabs because those are only added
# when remove paragraph spacing is enabled.
text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text)
text = re.sub(r'(?imu)^[ ]+', '', text)
text = re.sub(r'(?imu)[ ]+$', '', text)

# Remove empty space and newlines at the beginning of the document.
text = re.sub(r'(?u)^[ \n]+', '', text)

@ -406,7 +406,7 @@ class SearchDialog(QDialog):
self.resize(self.sizeHint())

def retrieve_template_search(self):
template, sep, query = re.split('#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE)
template, sep, query = re.split(r'#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE)
self.template_value_box.setText(query)
cb = self.template_test_type_box
for idx in range(0, cb.count()):

@ -744,7 +744,7 @@ class CreateCustomColumn(QDialog):
return self.simple_error('', _('The colors box must be empty or '
'contain the same number of items as the value box'))
for tc in c:
if tc not in QColor.colorNames() and not re.match('#(?:[0-9a-f]{3}){1,4}',tc,re.I):
if tc not in QColor.colorNames() and not re.match(r'#(?:[0-9a-f]{3}){1,4}',tc,re.I):
return self.simple_error('', _('The color {0} is unknown').format(tc))
display_dict = {'enum_values': l, 'enum_colors': c}
if default_val:

@ -146,7 +146,7 @@ class EmailAccounts(QAbstractTableModel): # {{{
if aval:
self.tags[account] = aval
elif col == 1:
self.accounts[account][0] = re.sub(',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper()))
self.accounts[account][0] = re.sub(r',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper()))
elif col == 0:
na = as_unicode(value or '').strip()
from email.utils import parseaddr

@ -920,6 +920,6 @@ if __name__ == '__main__': # {{{

def callback(ed):
import regex
ed.find_text(regex.compile('A bold word'))
ed.find_text(regex.compile(r'A bold word'))
launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
# }}}

@ -3828,7 +3828,7 @@ class CatalogBuilder:
# if self.opts.numbers_as_text and re.match('[0-9]+',word[0]):
translated.append(NumberToText(word).text.capitalize())
else:
if re.match('[0-9]+', word[0]):
if re.match(r'[0-9]+', word[0]):
word = word.replace(',', '')
suffix = re.search(r'[\D]', word)
if suffix:

@ -3844,7 +3844,7 @@ class CatalogBuilder:
translated.append(capitalize(word))

else:
if re.search('[0-9]+', word[0]):
if re.search(r'[0-9]+', word[0]):
word = word.replace(',', '')
suffix = re.search(r'[\D]', word)
if suffix:

@ -4114,7 +4114,7 @@ class CatalogBuilder:
Return:
(str): char if A-z, else SYMBOLS
'''
if not re.search('[a-zA-Z]', ascii_text(char)):
if not re.search(r'[a-zA-Z]', ascii_text(char)):
return self.SYMBOLS
else:
return char

@ -87,7 +87,7 @@ class NumberToText: # {{{
self.log('numberTranslate(): %s' % self.number)

# Special case ordinals
if re.search('[st|nd|rd|th]',self.number):
if re.search(r'[st|nd|rd|th]',self.number):
self.number = self.number.replace(',', '')
ordinal_suffix = re.search(r'[\D]', self.number)
ordinal_number = re.sub(r'\D','',self.number.replace(',', ''))

@ -134,7 +134,7 @@ class NumberToText: # {{{
self.log('Hyphenated: %s' % self.number)
self.number_as_float = self.number.split('-')[0]
strings = self.number.split('-')
if re.search('[0-9]+', strings[0]):
if re.search(r'[0-9]+', strings[0]):
left = NumberToText(strings[0]).text
right = strings[1]
else:

@ -143,7 +143,7 @@ class NumberToText: # {{{
self.text = f'{left}-{right}'

# Test for only commas and numbers
elif ',' in self.number and not re.search('[^0-9,]',self.number):
elif ',' in self.number and not re.search(r'[^0-9,]',self.number):
if self.verbose:
self.log('Comma(s): %s' % self.number)
self.number_as_float = self.number.replace(',', '')

@ -1504,11 +1504,11 @@ def text_to_tokens(text):
text = match.group(1)
OR = True
tokens = []
quot = re.search('"(.*?)"', text)
quot = re.search(r'"(.*?)"', text)
while quot:
tokens.append(quot.group(1))
text = text.replace('"'+quot.group(1)+'"', '')
quot = re.search('"(.*?)"', text)
quot = re.search(r'"(.*?)"', text)
tokens += text.split(' ')
ans = []
for i in tokens:

@ -2556,7 +2556,7 @@ class BibTeX:
self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]')
self.upper = re.compile('[' +
string.ascii_uppercase + ']')
self.escape = re.compile('[#&%_]')
self.escape = re.compile(r'[#&%_]')

def ValidateCitationKey(self, text):
'''

@ -59,7 +59,7 @@ def get_opts_from_parser(parser, prefix):


def send(ans):
pat = re.compile('([^0-9a-zA-Z_./-])')
pat = re.compile(r'([^0-9a-zA-Z_./-])')
for x in sorted(set(ans)):
x = pat.sub(lambda m : '\\'+m.group(1), x)
if x.endswith('\\ '):

@ -384,7 +384,7 @@ def format_date(dt, format, assume_utc=False, as_utc=False):

repl_func = partial(fd_repl_func, dt, 'ap' in format.lower())
return re.sub(
'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))',
r'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))',
repl_func, format)

# }}}

@ -460,7 +460,7 @@ def clean_date_for_sort(dt, fmt=None):
'min':UNDEFINED_DATE.minute, 'sec':UNDEFINED_DATE.second}

repl_func = partial(cd_repl_func, tt, dt)
re.sub('(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt)
re.sub(r'(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt)
return dt.replace(year=tt['year'], month=tt['mon'], day=tt['day'], hour=tt['hour'],
minute=tt['min'], second=tt['sec'], microsecond=0)
# }}}

@ -90,7 +90,7 @@ def get_system_locale():

def sanitize_lang(lang):
if lang:
match = re.match('[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
match = re.match(r'[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
if match:
lang = match.group()
if lang == 'zh':

@ -195,7 +195,7 @@ class Parser:
def tokenize(self, expr):
# convert docstrings to base64 to avoid all processing. Change the docstring
# indicator to something unique with no characters special to the parser.
expr = re.sub('(""")(..*?)(""")',
expr = re.sub(r'(""")(..*?)(""")',
lambda mo: self.docstring_sep + as_hex_unicode(mo.group(2)) + self.docstring_sep,
expr, flags=re.DOTALL)

@ -1730,7 +1730,7 @@ class BasicNewsRecipe(Recipe):

def error_in_article_download(self, request, traceback):
self.jobs_done += 1
if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None:
if traceback and re.search(r'^AbortArticle:', traceback, flags=re.M) is not None:
self.log.warn('Aborted download of article:', request.article.title,
'from', request.article.url)
self.report_progress(float(self.jobs_done)/len(self.jobs),

@ -59,7 +59,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels):
numbered = False
displayLevels = 0
listStyle = ListStyle(name=styleName)
numFormatPattern = re.compile('([1IiAa])')
numFormatPattern = re.compile(r'([1IiAa])')
cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?')
m = cssLengthPattern.search(spacing)
if (m is not None):
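
A note on the fix pattern above (an illustrative sketch, not part of the commit): without the r prefix, Python resolves backslash escapes before the regex engine ever sees the pattern, so an escape like '\d' only works because Python passes unknown escapes through, something newer Python versions warn about. Raw strings remove that ambiguity, and for patterns that contain no backslash at all, which covers most lines in this diff, adding the prefix changes nothing at runtime:

import re

# Both compile to the same pattern; the raw string avoids the
# invalid-escape-sequence warning that '\d' triggers on newer Pythons.
assert re.search('\\d+', 'abc123').group() == '123'
assert re.search(r'\d+', 'abc123').group() == '123'

# For backslash-free patterns the auto-fix is a pure no-op.
assert re.compile('td|th').pattern == re.compile(r'td|th').pattern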