Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-08-07 09:01:38 -04:00)
always use raw-string for regex (auto-fix)

ruff 'RUF039'

commit ac6912565a, parent 567a0187f3
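For context, Ruff's RUF039 rule asks that every pattern handed to the `re` (or `regex`) module be written as a raw string. The sketch below is not part of this commit; it only illustrates the standard Python behaviour that motivates the rule: in an ordinary string literal, backslash escapes are interpreted by Python before the regex engine ever sees the pattern.

    import re

    # In a plain string literal, '\b' is a backspace character (U+0008),
    # so the regex engine looks for a literal backspace and never matches:
    print(re.search('\bfoo\b', 'a foo here'))    # None

    # In a raw string the backslash survives, and '\b' means "word boundary":
    print(re.search(r'\bfoo\b', 'a foo here'))   # <re.Match object; span=(2, 5), match='foo'>

    # Escapes Python does not recognize, such as '\d', happen to pass through
    # unchanged, but recent Python versions emit a SyntaxWarning for them:
    print(re.findall('\d+', 'issue 42'))         # ['42'], with a warning
    print(re.findall(r'\d+', 'issue 42'))        # ['42'], no warning

Most of the patterns touched in this commit (e.g. 'flyer', 'recenzuje') contain no backslashes, so the auto-fix does not change their behaviour; the raw prefix simply makes every pattern uniform and safe against escape-sequence pitfalls.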
@ -49,7 +49,7 @@ def merge():
|
|||||||
clone_node(child, symbol)
|
clone_node(child, symbol)
|
||||||
ans.append(symbol)
|
ans.append(symbol)
|
||||||
ans = etree.tostring(ans, encoding='unicode', pretty_print=True, with_tail=False)
|
ans = etree.tostring(ans, encoding='unicode', pretty_print=True, with_tail=False)
|
||||||
ans = re.sub('<svg[^>]+>', '<svg style="display:none">', ans, count=1)
|
ans = re.sub(r'<svg[^>]+>', '<svg style="display:none">', ans, count=1)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,6 +29,6 @@ class AlejaKomiksu(BasicNewsRecipe):
|
|||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
tag = soup.find(attrs={'class': 'rodzaj'})
|
tag = soup.find(attrs={'class': 'rodzaj'})
|
||||||
if tag and tag.a.string.lower().strip() == 'recenzje':
|
if tag and tag.a.string.lower().strip() == 'recenzje':
|
||||||
link = soup.find(text=re.compile('recenzuje'))
|
link = soup.find(text=re.compile(r'recenzuje'))
|
||||||
if link:
|
if link:
|
||||||
return self.index_to_soup(link.parent['href'], raw=True)
|
return self.index_to_soup(link.parent['href'], raw=True)
|
||||||
|
@ -63,12 +63,12 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
|||||||
dict(
|
dict(
|
||||||
attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}),
|
attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}),
|
||||||
dict(name='img', attrs={'alt': 'logo'}),
|
dict(name='img', attrs={'alt': 'logo'}),
|
||||||
dict(name='div', attrs={'class': re.compile('infoEl')}),
|
dict(name='div', attrs={'class': re.compile(r'infoEl')}),
|
||||||
dict(name='span', attrs={'class': re.compile('loupe')})
|
dict(name='span', attrs={'class': re.compile(r'loupe')})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags_after = [
|
remove_tags_after = [
|
||||||
dict(name='div', attrs={'itemprop': re.compile('articleBody')})
|
dict(name='div', attrs={'itemprop': re.compile(r'articleBody')})
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
@ -58,7 +58,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
|||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup('http://www.birminghammail.co.uk')
|
soup = self.index_to_soup('http://www.birminghammail.co.uk')
|
||||||
cov = soup.find(attrs={'src': re.compile(
|
cov = soup.find(attrs={'src': re.compile(
|
||||||
'http://images.icnetwork.co.uk/upl/birm')})
|
r'http://images.icnetwork.co.uk/upl/birm')})
|
||||||
cov = str(cov)
|
cov = str(cov)
|
||||||
cov2 = re.findall(
|
cov2 = re.findall(
|
||||||
r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
|
r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
|
||||||
|
@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -39,12 +39,12 @@ class CSMonitor(BasicNewsRecipe):
|
|||||||
}
|
}
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile('(^|| )podStoryRel($|| )', re.DOTALL)}), dict(
|
dict(name=['meta', 'link', 'iframe', 'object', 'embed']), dict(attrs={'class': re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL)}), dict(
|
||||||
attrs={'class': ['bottom-rel', 'hide']}), dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']})
|
attrs={'class': ['bottom-rel', 'hide']}), dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']})
|
||||||
]
|
]
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='h1', attrs={'class': 'head'}), dict(name='h2', attrs={'class': 'subhead'}), dict(attrs={'class': [
|
dict(name='h1', attrs={'class': 'head'}), dict(name='h2', attrs={'class': 'subhead'}), dict(attrs={'class': [
|
||||||
'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
|
'sByline', 'thePhoto', 'ui-body-header']}), dict(attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
|
||||||
]
|
]
|
||||||
remove_attributes = ['xmlns:fb']
|
remove_attributes = ['xmlns:fb']
|
||||||
|
|
||||||
@ -74,11 +74,11 @@ class CSMonitor(BasicNewsRecipe):
|
|||||||
nurl = 'http://www.csmonitor.com' + nexttag['href']
|
nurl = 'http://www.csmonitor.com' + nexttag['href']
|
||||||
soup2 = self.index_to_soup(nurl)
|
soup2 = self.index_to_soup(nurl)
|
||||||
texttag = soup2.find(
|
texttag = soup2.find(
|
||||||
attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
|
attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
|
||||||
if texttag:
|
if texttag:
|
||||||
appendtag = soup.find(
|
appendtag = soup.find(
|
||||||
attrs={'class': re.compile('(^|| )sBody($|| )', re.DOTALL)})
|
attrs={'class': re.compile(r'(^|| )sBody($|| )', re.DOTALL)})
|
||||||
for citem in texttag.findAll(attrs={'class': [re.compile('(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}):
|
for citem in texttag.findAll(attrs={'class': [re.compile(r'(^|| )podStoryRel($|| )', re.DOTALL), 'bottom-rel', 'hide']}):
|
||||||
citem.extract()
|
citem.extract()
|
||||||
self.append_page(soup2)
|
self.append_page(soup2)
|
||||||
texttag.extract()
|
texttag.extract()
|
||||||
|
@ -47,7 +47,7 @@ class Chronicle(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Find cover
|
# Find cover
|
||||||
cover = soup0.find('div', attrs={
|
cover = soup0.find('div', attrs={
|
||||||
'class': 'side-content'}).find(attrs={'src': re.compile('photos/biz/Current')})
|
'class': 'side-content'}).find(attrs={'src': re.compile(r'photos/biz/Current')})
|
||||||
if cover is not None:
|
if cover is not None:
|
||||||
if 'chronicle.com' in cover['src']:
|
if 'chronicle.com' in cover['src']:
|
||||||
self.cover_url = cover['src']
|
self.cover_url = cover['src']
|
||||||
|
@ -86,7 +86,7 @@ class CourrierInternational(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for link in soup.findAll('a', href=re.compile('^/')):
|
for link in soup.findAll('a', href=re.compile(r'^/')):
|
||||||
link['href'] = 'http://www.courrierinternational.com' + link['href']
|
link['href'] = 'http://www.courrierinternational.com' + link['href']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -71,10 +71,10 @@ class AdvancedUserRecipe1467571059(BasicNewsRecipe):
|
|||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['embed', 'object']),
|
dict(name=['embed', 'object']),
|
||||||
dict(name='div', attrs={'class':['note NotePortrait', 'note']}),
|
dict(name='div', attrs={'class':['note NotePortrait', 'note']}),
|
||||||
dict(name='ul', attrs={'class':re.compile('article__share')}),
|
dict(name='ul', attrs={'class':re.compile(r'article__share')}),
|
||||||
dict(name='div', attrs={'class':'slideshow__controls'}),
|
dict(name='div', attrs={'class':'slideshow__controls'}),
|
||||||
dict(name='a', attrs={'role':'button'}),
|
dict(name='a', attrs={'role':'button'}),
|
||||||
dict(name='figure', attrs={'class':re.compile('video')})
|
dict(name='figure', attrs={'class':re.compile(r'video')})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_attributes = ['width', 'height']
|
remove_attributes = ['width', 'height']
|
||||||
|
@ -31,9 +31,9 @@ class deredactie(BasicNewsRecipe):
|
|||||||
catnames = {}
|
catnames = {}
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
'http://www.deredactie.be/cm/vrtnieuws.deutsch')
|
'http://www.deredactie.be/cm/vrtnieuws.deutsch')
|
||||||
for elem in soup.findAll('li', attrs={'id': re.compile('^navItem[2-9]')}):
|
for elem in soup.findAll('li', attrs={'id': re.compile(r'^navItem[2-9]')}):
|
||||||
a = elem.find('a', href=True)
|
a = elem.find('a', href=True)
|
||||||
m = re.search('(?<=/)[^/]*$', a['href'])
|
m = re.search(r'(?<=/)[^/]*$', a['href'])
|
||||||
cat = str(m.group(0))
|
cat = str(m.group(0))
|
||||||
categories.append(cat)
|
categories.append(cat)
|
||||||
catnames[cat] = a['title']
|
catnames[cat] = a['title']
|
||||||
@ -45,7 +45,7 @@ class deredactie(BasicNewsRecipe):
|
|||||||
articles = []
|
articles = []
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
'http://www.deredactie.be/cm/vrtnieuws.deutsch/' + cat)
|
'http://www.deredactie.be/cm/vrtnieuws.deutsch/' + cat)
|
||||||
for a in soup.findAll('a', attrs={'href': re.compile('deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}):
|
for a in soup.findAll('a', attrs={'href': re.compile(r'deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_')}):
|
||||||
skip_this_article = False
|
skip_this_article = False
|
||||||
url = a['href'].strip()
|
url = a['href'].strip()
|
||||||
if url.startswith('/'):
|
if url.startswith('/'):
|
||||||
|
@ -51,7 +51,7 @@ class Donga(BasicNewsRecipe):
|
|||||||
# https://www.donga.com/news/[sections]/article/all/[date]/[gid]/1
|
# https://www.donga.com/news/[sections]/article/all/[date]/[gid]/1
|
||||||
# Return print version url with syntax:
|
# Return print version url with syntax:
|
||||||
# https://www.donga.com/news/View?gid=[gid]&date=[date]
|
# https://www.donga.com/news/View?gid=[gid]&date=[date]
|
||||||
reobject = re.search('(?<=/all/)([0-9]*)/([0-9]*)', url)
|
reobject = re.search(r'(?<=/all/)([0-9]*)/([0-9]*)', url)
|
||||||
date = reobject.group(1)
|
date = reobject.group(1)
|
||||||
gid = reobject.group(2)
|
gid = reobject.group(2)
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ class dwutygodnik(BasicNewsRecipe):
|
|||||||
browser.open('http://www.dwutygodnik.com/')
|
browser.open('http://www.dwutygodnik.com/')
|
||||||
|
|
||||||
# find the link
|
# find the link
|
||||||
epublink = browser.find_link(text_regex=re.compile('Wydanie EPUB'))
|
epublink = browser.find_link(text_regex=re.compile(r'Wydanie EPUB'))
|
||||||
|
|
||||||
# download ebook
|
# download ebook
|
||||||
self.report_progress(0, _('Downloading ePUB'))
|
self.report_progress(0, _('Downloading ePUB'))
|
||||||
|
@ -21,8 +21,8 @@ class Dziennik_pl(BasicNewsRecipe):
|
|||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}'
|
extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}'
|
||||||
preprocess_regexps = [(re.compile('Komentarze:'), lambda m: ''), (re.compile(
|
preprocess_regexps = [(re.compile(r'Komentarze:'), lambda m: ''), (re.compile(
|
||||||
'<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
|
r'<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
|
||||||
keep_only_tags = [dict(id='article')]
|
keep_only_tags = [dict(id='article')]
|
||||||
remove_tags = [dict(name='div', attrs={'class': ['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class': ['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')] # noqa: E501
|
remove_tags = [dict(name='div', attrs={'class': ['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class': ['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')] # noqa: E501
|
||||||
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
||||||
|
@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -51,7 +51,7 @@ class Esensja(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
|
soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
|
||||||
a = soup.find('a', attrs={'href': re.compile('.*/index.html')})
|
a = soup.find('a', attrs={'href': re.compile(r'.*/index.html')})
|
||||||
year = a['href'].split('/')[0]
|
year = a['href'].split('/')[0]
|
||||||
month = a['href'].split('/')[1]
|
month = a['href'].split('/')[1]
|
||||||
self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
|
self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
|
||||||
@ -149,7 +149,7 @@ class Esensja(BasicNewsRecipe):
|
|||||||
info = tag.find(attrs={'class': 'img_info'})
|
info = tag.find(attrs={'class': 'img_info'})
|
||||||
text = str(tag)
|
text = str(tag)
|
||||||
if not src:
|
if not src:
|
||||||
src = re.search('src="[^"]*?"', text)
|
src = re.search(r'src="[^"]*?"', text)
|
||||||
if src:
|
if src:
|
||||||
src = src.group(0)
|
src = src.group(0)
|
||||||
src = src[5:].replace('//', '/')
|
src = src[5:].replace('//', '/')
|
||||||
|
@ -95,7 +95,7 @@ class EsensjaRSS(BasicNewsRecipe):
|
|||||||
info = tag.find(attrs={'class': 'img_info'})
|
info = tag.find(attrs={'class': 'img_info'})
|
||||||
text = str(tag)
|
text = str(tag)
|
||||||
if not src:
|
if not src:
|
||||||
src = re.search('src="[^"]*?"', text)
|
src = re.search(r'src="[^"]*?"', text)
|
||||||
if src:
|
if src:
|
||||||
src = src.group(0)
|
src = src.group(0)
|
||||||
src = src[5:].replace('//', '/')
|
src = src[5:].replace('//', '/')
|
||||||
|
@ -109,7 +109,7 @@ img { background: none !important; float: none; margin: 0px; }
|
|||||||
|
|
||||||
for post in soup.findAll('a'):
|
for post in soup.findAll('a'):
|
||||||
strpost = str(post)
|
strpost = str(post)
|
||||||
if re.match('<a href="https://www1.folha.uol.com.br/.*/"><svg aria-hidden="true" class="icon icon--star"', strpost):
|
if re.match(r'<a href="https://www1.folha.uol.com.br/.*/"><svg aria-hidden="true" class="icon icon--star"', strpost):
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((section_title, articles))
|
feeds.append((section_title, articles))
|
||||||
self.log()
|
self.log()
|
||||||
|
@ -39,7 +39,7 @@ class AdvancedUserRecipe1515196393(BasicNewsRecipe):
|
|||||||
feeds = []
|
feeds = []
|
||||||
br = self.get_browser()
|
br = self.get_browser()
|
||||||
self.ctdir = PersistentTemporaryDirectory()
|
self.ctdir = PersistentTemporaryDirectory()
|
||||||
for x in toc.findAll(['li'], attrs={'class': re.compile('.*get_content.*')}):
|
for x in toc.findAll(['li'], attrs={'class': re.compile(r'.*get_content.*')}):
|
||||||
edwo = x.find('a')
|
edwo = x.find('a')
|
||||||
title = self.tag_to_string(edwo)
|
title = self.tag_to_string(edwo)
|
||||||
self.log('\t\tFound article:', title)
|
self.log('\t\tFound article:', title)
|
||||||
|
@ -54,7 +54,7 @@ class GN(BasicNewsRecipe):
|
|||||||
}]
|
}]
|
||||||
feeds.append((u'Na dobry początek', articles))
|
feeds.append((u'Na dobry początek', articles))
|
||||||
# columns:
|
# columns:
|
||||||
for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
|
for addr in soup.findAll('a', attrs={'href': re.compile(r'kategoria')}):
|
||||||
if not addr.span:
|
if not addr.span:
|
||||||
main_block = self.index_to_soup(
|
main_block = self.index_to_soup(
|
||||||
'http://www.gosc.pl' + addr['href'])
|
'http://www.gosc.pl' + addr['href'])
|
||||||
|
@ -50,7 +50,7 @@ class GN(BasicNewsRecipe):
|
|||||||
}]
|
}]
|
||||||
feeds.append((u'Na dobry początek', articles))
|
feeds.append((u'Na dobry początek', articles))
|
||||||
# columns:
|
# columns:
|
||||||
for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
|
for addr in soup.findAll('a', attrs={'href': re.compile(r'kategoria')}):
|
||||||
if not addr.span:
|
if not addr.span:
|
||||||
main_block = self.index_to_soup(
|
main_block = self.index_to_soup(
|
||||||
'http://www.gosc.pl' + addr['href'])
|
'http://www.gosc.pl' + addr['href'])
|
||||||
|
@ -50,10 +50,10 @@ class GazetvanAntwerpen(BasicNewsRecipe):
|
|||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['embed', 'object']),
|
dict(name=['embed', 'object']),
|
||||||
dict(name='div', attrs={'class': ['note NotePortrait', 'note']}),
|
dict(name='div', attrs={'class': ['note NotePortrait', 'note']}),
|
||||||
dict(name='ul', attrs={'class': re.compile('article__share')}),
|
dict(name='ul', attrs={'class': re.compile(r'article__share')}),
|
||||||
dict(name='div', attrs={'class': 'slideshow__controls'}),
|
dict(name='div', attrs={'class': 'slideshow__controls'}),
|
||||||
dict(name='a', attrs={'role': 'button'}),
|
dict(name='a', attrs={'role': 'button'}),
|
||||||
dict(name='figure', attrs={'class': re.compile('video')})
|
dict(name='figure', attrs={'class': re.compile(r'video')})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_attributes = ['width', 'height']
|
remove_attributes = ['width', 'height']
|
||||||
|
@ -78,7 +78,7 @@ class HNWithCommentsLink(BasicNewsRecipe):
|
|||||||
br = td.find('br')
|
br = td.find('br')
|
||||||
if br:
|
if br:
|
||||||
br.extract()
|
br.extract()
|
||||||
reply = td.find('a', attrs={'href': re.compile('^reply?')})
|
reply = td.find('a', attrs={'href': re.compile(r'^reply?')})
|
||||||
if reply:
|
if reply:
|
||||||
reply.parent.extract()
|
reply.parent.extract()
|
||||||
td.name = 'div'
|
td.name = 'div'
|
||||||
|
@ -59,7 +59,7 @@ class Handelsblatt(BasicNewsRecipe):
|
|||||||
dict(name='aside', attrs={'class': ['vhb-article-element vhb-left',
|
dict(name='aside', attrs={'class': ['vhb-article-element vhb-left',
|
||||||
'vhb-article-element vhb-left vhb-teasergallery',
|
'vhb-article-element vhb-left vhb-teasergallery',
|
||||||
'vhb-article-element vhb-left vhb-shorttexts']}),
|
'vhb-article-element vhb-left vhb-shorttexts']}),
|
||||||
dict(name='aside', attrs={'class': re.compile('vhb-club-events')}),
|
dict(name='aside', attrs={'class': re.compile(r'vhb-club-events')}),
|
||||||
dict(name='article', attrs={'class': ['vhb-imagegallery vhb-teaser',
|
dict(name='article', attrs={'class': ['vhb-imagegallery vhb-teaser',
|
||||||
'vhb-teaser vhb-type-video']}),
|
'vhb-teaser vhb-type-video']}),
|
||||||
dict(name='small', attrs={'class': ['vhb-credit']}),
|
dict(name='small', attrs={'class': ['vhb-credit']}),
|
||||||
@ -70,14 +70,14 @@ class Handelsblatt(BasicNewsRecipe):
|
|||||||
'opinary-widget-wrapper',
|
'opinary-widget-wrapper',
|
||||||
'vhb-article__content-element--shorttextgallery',
|
'vhb-article__content-element--shorttextgallery',
|
||||||
'vhb-hollow-area vhb-hollow-area--col-1']}),
|
'vhb-hollow-area vhb-hollow-area--col-1']}),
|
||||||
dict(name='div', attrs={'class': re.compile('stepstone')}),
|
dict(name='div', attrs={'class': re.compile(r'stepstone')}),
|
||||||
dict(name='div', attrs={'class': re.compile('vhb-imagegallery')}),
|
dict(name='div', attrs={'class': re.compile(r'vhb-imagegallery')}),
|
||||||
dict(name='div', attrs={'id': ['highcharts_infografik']}),
|
dict(name='div', attrs={'id': ['highcharts_infografik']}),
|
||||||
dict(name='div', attrs={'id': re.compile('dax-sentiment')}),
|
dict(name='div', attrs={'id': re.compile(r'dax-sentiment')}),
|
||||||
dict(name=['div', 'section'], attrs={'class': re.compile('slider')}),
|
dict(name=['div', 'section'], attrs={'class': re.compile(r'slider')}),
|
||||||
dict(name='a', attrs={'class': ['twitter-follow-button']}),
|
dict(name='a', attrs={'class': ['twitter-follow-button']}),
|
||||||
dict(name='img', attrs={'class': ['highlight-icon', 'lb-author__avatar', 'pin-icon']}),
|
dict(name='img', attrs={'class': ['highlight-icon', 'lb-author__avatar', 'pin-icon']}),
|
||||||
dict(name='img', attrs={'alt': re.compile('Handelsblatt Morning Briefing')}),
|
dict(name='img', attrs={'alt': re.compile(r'Handelsblatt Morning Briefing')}),
|
||||||
dict(name=['blockquote', 'button', 'link'])
|
dict(name=['blockquote', 'button', 'link'])
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -138,7 +138,7 @@ class Handelsblatt(BasicNewsRecipe):
|
|||||||
|
|
||||||
def postprocess_html(self, soup, first_fetch):
|
def postprocess_html(self, soup, first_fetch):
|
||||||
# convert lists of author(s) and date(s) into simple text
|
# convert lists of author(s) and date(s) into simple text
|
||||||
for cap in soup.find_all('div', {'class': re.compile('vhb-article-caption')}):
|
for cap in soup.find_all('div', {'class': re.compile(r'vhb-article-caption')}):
|
||||||
cap.replace_with(cap.encode_contents().decode('utf-8').strip() + ' ')
|
cap.replace_with(cap.encode_contents().decode('utf-8').strip() + ' ')
|
||||||
for row in soup.find_all('div', {'class': 'vhb-article-author-row'}):
|
for row in soup.find_all('div', {'class': 'vhb-article-author-row'}):
|
||||||
for ul in row.find_all('ul'):
|
for ul in row.find_all('ul'):
|
||||||
@ -160,7 +160,7 @@ class Handelsblatt(BasicNewsRecipe):
|
|||||||
fig.find('div', {'class': 'vhb-caption'}).replace_with(cap)
|
fig.find('div', {'class': 'vhb-caption'}).replace_with(cap)
|
||||||
# remove references to related articles
|
# remove references to related articles
|
||||||
for strong in soup.find_all('strong'):
|
for strong in soup.find_all('strong'):
|
||||||
if strong.string and (re.match('^Mehr:? ?', strong.string) or re.match('^>>.*', strong.string)):
|
if strong.string and (re.match(r'^Mehr:? ?', strong.string) or re.match(r'^>>.*', strong.string)):
|
||||||
p_parent = strong.find_parent('p')
|
p_parent = strong.find_parent('p')
|
||||||
if p_parent:
|
if p_parent:
|
||||||
p_parent.decompose()
|
p_parent.decompose()
|
||||||
|
@ -49,7 +49,7 @@ class HistoryToday(BasicNewsRecipe):
|
|||||||
# Go to issue
|
# Go to issue
|
||||||
soup = self.index_to_soup('https://www.historytoday.com/contents')
|
soup = self.index_to_soup('https://www.historytoday.com/contents')
|
||||||
cover = soup.find('div', attrs={
|
cover = soup.find('div', attrs={
|
||||||
'id': 'content-area'}).find('img', attrs={'src': re.compile('.*cover.*')})['src']
|
'id': 'content-area'}).find('img', attrs={'src': re.compile(r'.*cover.*')})['src']
|
||||||
self.cover_url = cover
|
self.cover_url = cover
|
||||||
self.log(self.cover_url)
|
self.log(self.cover_url)
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ class IndiaToday(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
m = re.search('id="__NEXT_DATA__" type="application/json">', raw)
|
m = re.search(r'id="__NEXT_DATA__" type="application/json">', raw)
|
||||||
raw = raw[m.start():]
|
raw = raw[m.start():]
|
||||||
raw = raw.split('>', 1)[1]
|
raw = raw.split('>', 1)[1]
|
||||||
data = json.JSONDecoder().raw_decode(raw)[0]
|
data = json.JSONDecoder().raw_decode(raw)[0]
|
||||||
|
@ -36,7 +36,7 @@ class JoopRecipe(BasicNewsRecipe):
|
|||||||
keep_only_tags.append(
|
keep_only_tags.append(
|
||||||
dict(name='h2', attrs={'class': 'columnhead smallline'}))
|
dict(name='h2', attrs={'class': 'columnhead smallline'}))
|
||||||
keep_only_tags.append(
|
keep_only_tags.append(
|
||||||
dict(name='div', attrs={'class': re.compile('article.*')}))
|
dict(name='div', attrs={'class': re.compile(r'article.*')}))
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
|
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
|
||||||
|
@ -44,16 +44,16 @@ class Kurier(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='article', attrs={'class': re.compile('main-article')})
|
dict(name='article', attrs={'class': re.compile(r'main-article')})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='div', attrs={'class': 'social-media-container'}),
|
dict(name='div', attrs={'class': 'social-media-container'}),
|
||||||
dict(name='section', attrs={'class': 'tags'}),
|
dict(name='section', attrs={'class': 'tags'}),
|
||||||
dict(name='section', attrs={'class': re.compile('comment-box')}),
|
dict(name='section', attrs={'class': re.compile(r'comment-box')}),
|
||||||
dict(name='section', attrs={'class': re.compile('related-content')}),
|
dict(name='section', attrs={'class': re.compile(r'related-content')}),
|
||||||
dict(name='section', attrs={'class': re.compile('article-slider')}),
|
dict(name='section', attrs={'class': re.compile(r'article-slider')}),
|
||||||
dict(name='section', attrs={'class': re.compile('commentcontainer')}),
|
dict(name='section', attrs={'class': re.compile(r'commentcontainer')}),
|
||||||
dict(name='blockquote')
|
dict(name='blockquote')
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ class Kyungyhang(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
|
||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(re.compile("<div class='ad_movFocus'.*</html>",
|
(re.compile(r"<div class='ad_movFocus'.*</html>",
|
||||||
re.DOTALL | re.IGNORECASE), lambda match: '</html>'),
|
re.DOTALL | re.IGNORECASE), lambda match: '</html>'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ class LeMondeAbonne(BasicNewsRecipe):
|
|||||||
files = os.listdir(path)
|
files = os.listdir(path)
|
||||||
|
|
||||||
nb_index_files = len([
|
nb_index_files = len([
|
||||||
name for name in files if re.match('frame_gauche_[0-9]+.html', name)
|
name for name in files if re.match(r'frame_gauche_[0-9]+.html', name)
|
||||||
])
|
])
|
||||||
|
|
||||||
flux = []
|
flux = []
|
||||||
|
@ -144,7 +144,7 @@ class WeeklyLWN(BasicNewsRecipe):
|
|||||||
# Most articles have anchors in their titles, *except* the
|
# Most articles have anchors in their titles, *except* the
|
||||||
# security vulnerabilities
|
# security vulnerabilities
|
||||||
article_anchor = curr.find(
|
article_anchor = curr.find(
|
||||||
name='a', attrs={'href': re.compile('^/Articles/')})
|
name='a', attrs={'href': re.compile(r'^/Articles/')})
|
||||||
|
|
||||||
if article_anchor:
|
if article_anchor:
|
||||||
article_url = article_anchor.get('href')
|
article_url = article_anchor.get('href')
|
||||||
|
@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -71,21 +71,21 @@ class Newsweek(BasicNewsRecipe):
|
|||||||
strong = p.find('strong')
|
strong = p.find('strong')
|
||||||
if strong:
|
if strong:
|
||||||
newest = re.compile(
|
newest = re.compile(
|
||||||
'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek')
|
r'Tekst pochodzi z najnowszego numeru Tygodnika Newsweek')
|
||||||
if newest.search(str(strong)):
|
if newest.search(str(strong)):
|
||||||
strong.extract()
|
strong.extract()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
itunes = p.find('a')
|
itunes = p.find('a')
|
||||||
if itunes:
|
if itunes:
|
||||||
reurl = re.compile('itunes.apple.com')
|
reurl = re.compile(r'itunes.apple.com')
|
||||||
if reurl.search(str(itunes['href'])):
|
if reurl.search(str(itunes['href'])):
|
||||||
p.extract()
|
p.extract()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
imagedesc = p.find('div', attrs={'class': 'image-desc'})
|
imagedesc = p.find('div', attrs={'class': 'image-desc'})
|
||||||
if imagedesc:
|
if imagedesc:
|
||||||
redesc = re.compile('Okładka numeru')
|
redesc = re.compile(r'Okładka numeru')
|
||||||
if (redesc.search(str(imagedesc))):
|
if (redesc.search(str(imagedesc))):
|
||||||
p.extract()
|
p.extract()
|
||||||
continue
|
continue
|
||||||
|
@ -77,10 +77,10 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe):
|
|||||||
print('-------------------------get index of paper--------------------------------')
|
print('-------------------------get index of paper--------------------------------')
|
||||||
result = []
|
result = []
|
||||||
soup = self.index_to_soup('http://www.nikkei.com/paper/')
|
soup = self.index_to_soup('http://www.nikkei.com/paper/')
|
||||||
sections = soup.findAll(attrs={'class': re.compile('.*cmn-article_title.*')})
|
sections = soup.findAll(attrs={'class': re.compile(r'.*cmn-article_title.*')})
|
||||||
|
|
||||||
for sect in sections:
|
for sect in sections:
|
||||||
sect_title = sect.find(attrs={'class' : re.compile('.*cmnc-((large)|(middle)|(small)).*')})
|
sect_title = sect.find(attrs={'class' : re.compile(r'.*cmnc-((large)|(middle)|(small)).*')})
|
||||||
if sect_title is None:
|
if sect_title is None:
|
||||||
continue
|
continue
|
||||||
sect_title = sect_title.contents[0]
|
sect_title = sect_title.contents[0]
|
||||||
|
@ -62,7 +62,7 @@ class NRCNext(BasicNewsRecipe):
|
|||||||
zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
|
zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
|
||||||
zfile.extractall(self.output_dir)
|
zfile.extractall(self.output_dir)
|
||||||
namelist = zfile.namelist()
|
namelist = zfile.namelist()
|
||||||
emre = re.compile('<em(?:.*)>(.*)</em>')
|
emre = re.compile(r'<em(?:.*)>(.*)</em>')
|
||||||
subst = '\\1'
|
subst = '\\1'
|
||||||
for name in namelist:
|
for name in namelist:
|
||||||
_, ext = os.path.splitext(name)
|
_, ext = os.path.splitext(name)
|
||||||
|
@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -48,7 +48,7 @@ class outlook(BasicNewsRecipe):
|
|||||||
return [('Articles', ans)]
|
return [('Articles', ans)]
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
m = re.search('id="__NEXT_DATA__" type="application/json">', raw)
|
m = re.search(r'id="__NEXT_DATA__" type="application/json">', raw)
|
||||||
raw = raw[m.start():]
|
raw = raw[m.start():]
|
||||||
raw = raw.split('>', 1)[1]
|
raw = raw.split('>', 1)[1]
|
||||||
data = json.JSONDecoder().raw_decode(raw)[0]
|
data = json.JSONDecoder().raw_decode(raw)[0]
|
||||||
|
@ -41,9 +41,9 @@ class Polter(BasicNewsRecipe):
|
|||||||
(u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html')]
|
(u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html')]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for s in soup.findAll(attrs={'style': re.compile('float: ?left')}):
|
for s in soup.findAll(attrs={'style': re.compile(r'float: ?left')}):
|
||||||
s['class'] = 'floatleft'
|
s['class'] = 'floatleft'
|
||||||
for s in soup.findAll(attrs={'style': re.compile('float: ?right')}):
|
for s in soup.findAll(attrs={'style': re.compile(r'float: ?right')}):
|
||||||
s['class'] = 'floatright'
|
s['class'] = 'floatright'
|
||||||
for s in soup.findAll(style=True):
|
for s in soup.findAll(style=True):
|
||||||
if 'bold;' in s['style']:
|
if 'bold;' in s['style']:
|
||||||
|
@ -161,9 +161,9 @@ class PrivateEyeRecipe(BasicNewsRecipe):
|
|||||||
{'name': 'div', 'attrs': {'id': 'about-covers'}},
|
{'name': 'div', 'attrs': {'id': 'about-covers'}},
|
||||||
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
|
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
|
||||||
{'name': 'iframe'},
|
{'name': 'iframe'},
|
||||||
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
|
{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}},
|
||||||
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
|
{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}},
|
||||||
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}},
|
{'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}},
|
||||||
]
|
]
|
||||||
|
|
||||||
# Convert headers to h1, strapline to h4
|
# Convert headers to h1, strapline to h4
|
||||||
|
@ -54,7 +54,7 @@ class ScienceNewsIssue(BasicNewsRecipe):
|
|||||||
# Get articles
|
# Get articles
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
soup = soup.find('main', attrs={'id':'content'})
|
soup = soup.find('main', attrs={'id':'content'})
|
||||||
re_article = re.compile('https://www.sciencenews.org/article/')
|
re_article = re.compile(r'https://www.sciencenews.org/article/')
|
||||||
stories = []
|
stories = []
|
||||||
past_urls = set()
|
past_urls = set()
|
||||||
for sec in soup.find_all(href=re_article):
|
for sec in soup.find_all(href=re_article):
|
||||||
|
@ -76,8 +76,8 @@ class SolHaberRecipe(BasicNewsRecipe):
|
|||||||
result = []
|
result = []
|
||||||
articles_dict = {}
|
articles_dict = {}
|
||||||
|
|
||||||
author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
|
author_regexp = re.compile(r'^http://.*?/yazarlar/(.*?)/.*$')
|
||||||
category_regexp = re.compile('^http://.*?/(.+?)/.*$')
|
category_regexp = re.compile(r'^http://.*?/(.+?)/.*$')
|
||||||
|
|
||||||
for section_tuple in self.section_tuples:
|
for section_tuple in self.section_tuples:
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ class StandardMediaKeRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
import re
|
import re
|
||||||
p = re.compile('http://www.standardmedia.co.ke/.*InsidePage.php')
|
p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php')
|
||||||
return p.sub('http://www.standardmedia.co.ke/print.php', url)
|
return p.sub('http://www.standardmedia.co.ke/print.php', url)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
@ -89,7 +89,7 @@ class TheAge(BasicNewsRecipe):
|
|||||||
|
|
||||||
for i in soup.findAll('a'):
|
for i in soup.findAll('a'):
|
||||||
href = i['href']
|
href = i['href']
|
||||||
if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
|
if href and re.match(r'http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
|
||||||
return href
|
return href
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
@ -92,7 +92,7 @@ class PrivateEyeRecipe(BasicNewsRecipe):
|
|||||||
# 1. Title. By author
|
# 1. Title. By author
|
||||||
#.2. Title by author: subtitle
|
#.2. Title by author: subtitle
|
||||||
# 3. Title: author: subtitle
|
# 3. Title: author: subtitle
|
||||||
title_author_re = re.compile('^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')
|
title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')
|
||||||
|
|
||||||
# Separate author from title (where it is specified)
|
# Separate author from title (where it is specified)
|
||||||
def title_author(self, head):
|
def title_author(self, head):
|
||||||
|
@ -38,7 +38,7 @@ class Tweakers(BasicNewsRecipe):
|
|||||||
'class': ['sidebar', 'advertorial']
|
'class': ['sidebar', 'advertorial']
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
'class': re.compile('nextPrevious')
|
'class': re.compile(r'nextPrevious')
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
@ -126,7 +126,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -140,7 +140,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -127,7 +127,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
#photocredit { font-size: xx-small; font-weight: normal; }'''
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]
|
keep_only_tags = [dict(name='div', attrs={'id': re.compile(r'story')})]
|
||||||
|
|
||||||
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
remove_tags = [{'class': 'comments'}, {'class': 'comment-intro'}, {'class': 'storytab'},
|
||||||
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
dict(name='div', attrs={'class': 'section_title'}), dict(name='div', attrs={'class': 'sharebar'}), dict(
|
||||||
@ -141,7 +141,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
name='div', attrs={'class': 'copyright'}),
|
name='div', attrs={'class': 'copyright'}),
|
||||||
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
dict(name='div', attrs={'class': 'rule_grey_solid'}),
|
||||||
dict(name='div', attrs={'id': 'soundoff'}),
|
dict(name='div', attrs={'id': 'soundoff'}),
|
||||||
dict(name='div', attrs={'id': re.compile('flyer')}),
|
dict(name='div', attrs={'id': re.compile(r'flyer')}),
|
||||||
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
|
@ -82,28 +82,28 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
'''
|
'''
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'class': re.compile('main.content')})]
|
dict(name='div', attrs={'class': re.compile(r'main.content')})]
|
||||||
|
|
||||||
def __init__(self, options, log, progress_reporter):
|
def __init__(self, options, log, progress_reporter):
|
||||||
self.remove_tags = [{'class': 'comments'},
|
self.remove_tags = [{'class': 'comments'},
|
||||||
{'id': 'photocredit'},
|
{'id': 'photocredit'},
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('top.controls')}),
|
'class': re.compile(r'top.controls')}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('^comments')}),
|
'class': re.compile(r'^comments')}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('social')}),
|
'class': re.compile(r'social')}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('tools')}),
|
'class': re.compile(r'tools')}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('bottom.tools')}),
|
'class': re.compile(r'bottom.tools')}),
|
||||||
dict(name='div', attrs={
|
dict(name='div', attrs={
|
||||||
'class': re.compile('window')}),
|
'class': re.compile(r'window')}),
|
||||||
dict(name='div', attrs={'class': re.compile('related.news.element')})]
|
dict(name='div', attrs={'class': re.compile(r'related.news.element')})]
|
||||||
print('PROFILE NAME = ' + options.output_profile.short_name)
|
print('PROFILE NAME = ' + options.output_profile.short_name)
|
||||||
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||||
self.remove_tags.append(
|
self.remove_tags.append(
|
||||||
dict(name='div', attrs={'class': re.compile('image-container')}))
|
dict(name='div', attrs={'class': re.compile(r'image-container')}))
|
||||||
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
@ -173,19 +173,19 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
byline = soup.find('p', attrs={'class': re.compile('ancillary')})
|
byline = soup.find('p', attrs={'class': re.compile(r'ancillary')})
|
||||||
if byline is not None:
|
if byline is not None:
|
||||||
authstr = self.tag_to_string(byline, False)
|
authstr = self.tag_to_string(byline, False)
|
||||||
authstr = re.sub('/ *Times Colonist', '/',
|
authstr = re.sub(r'/ *Times Colonist', '/',
|
||||||
authstr, flags=re.IGNORECASE)
|
authstr, flags=re.IGNORECASE)
|
||||||
authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
|
authstr = re.sub(r'BY */', '', authstr, flags=re.IGNORECASE)
|
||||||
newdiv = new_tag(soup, 'div')
|
newdiv = new_tag(soup, 'div')
|
||||||
newdiv.insert(0, authstr)
|
newdiv.insert(0, authstr)
|
||||||
newdiv['class'] = 'byline'
|
newdiv['class'] = 'byline'
|
||||||
byline.replaceWith(newdiv)
|
byline.replaceWith(newdiv)
|
||||||
for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
|
for caption in soup.findAll('p', attrs={'class': re.compile(r'caption')}):
|
||||||
capstr = self.tag_to_string(caption, False)
|
capstr = self.tag_to_string(caption, False)
|
||||||
capstr = re.sub('Photograph by.*$', '',
|
capstr = re.sub(r'Photograph by.*$', '',
|
||||||
capstr, flags=re.IGNORECASE)
|
capstr, flags=re.IGNORECASE)
|
||||||
newdiv = new_tag(soup, 'div')
|
newdiv = new_tag(soup, 'div')
|
||||||
newdiv.insert(0, capstr)
|
newdiv.insert(0, capstr)
|
||||||
@ -239,13 +239,13 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
except:
|
except:
|
||||||
return ans
|
return ans
|
||||||
mainsoup = soup.find(
|
mainsoup = soup.find(
|
||||||
'div', attrs={'class': re.compile('main.content')})
|
'div', attrs={'class': re.compile(r'main.content')})
|
||||||
article_list = []
|
article_list = []
|
||||||
for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
|
for wdiv in mainsoup.findAll('div', attrs={'id': re.compile(r'featured.story')}):
|
||||||
for htag in wdiv.findAll('h3'):
|
for htag in wdiv.findAll('h3'):
|
||||||
self.handle_articles(htag, article_list, sectitle)
|
self.handle_articles(htag, article_list, sectitle)
|
||||||
for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
|
for ladiv in mainsoup.findAll(attrs={'class': re.compile(r'leading.articles')}):
|
||||||
for wdiv in mainsoup.findAll('div', attrs={'class': re.compile('article.row')}):
|
for wdiv in mainsoup.findAll('div', attrs={'class': re.compile(r'article.row')}):
|
||||||
for htag in wdiv.findAll('h2'):
|
for htag in wdiv.findAll('h2'):
|
||||||
self.handle_articles(htag, article_list, sectitle)
|
self.handle_articles(htag, article_list, sectitle)
|
||||||
ans.append((sectitle, article_list))
|
ans.append((sectitle, article_list))
|
||||||
|
@ -139,7 +139,7 @@ class ZeitDe(BasicNewsRecipe):
|
|||||||
body.insert(0, header)
|
body.insert(0, header)
|
||||||
|
|
||||||
# Add real img tags for images
|
# Add real img tags for images
|
||||||
for container in soup.findAll(class_=re.compile('__media-container$')):
|
for container in soup.findAll(class_=re.compile(r'__media-container$')):
|
||||||
img = container.find('noscript')
|
img = container.find('noscript')
|
||||||
if img is not None:
|
if img is not None:
|
||||||
img.name = 'div'
|
img.name = 'div'
|
||||||
|
@ -200,11 +200,11 @@ class ZeitEPUBAbo(BasicNewsRecipe):
|
|||||||
# browser.follow_link(abolink)
|
# browser.follow_link(abolink)
|
||||||
# find page for latest issue
|
# find page for latest issue
|
||||||
latestlink = browser.find_link(text_regex=re.compile(
|
latestlink = browser.find_link(text_regex=re.compile(
|
||||||
'.*ZUR AKTUELLEN AUSGABE.*'))
|
r'.*ZUR AKTUELLEN AUSGABE.*'))
|
||||||
browser.follow_link(latestlink)
|
browser.follow_link(latestlink)
|
||||||
# now find the correct file, we will still use the ePub file
|
# now find the correct file, we will still use the ePub file
|
||||||
epublink = browser.find_link(text_regex=re.compile(
|
epublink = browser.find_link(text_regex=re.compile(
|
||||||
'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
|
r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
|
||||||
response = browser.follow_link(epublink)
|
response = browser.follow_link(epublink)
|
||||||
self.report_progress(1, _('next step'))
|
self.report_progress(1, _('next step'))
|
||||||
|
|
||||||
@ -266,11 +266,11 @@ class ZeitEPUBAbo(BasicNewsRecipe):
|
|||||||
# browser.follow_link(abolink)
|
# browser.follow_link(abolink)
|
||||||
# find page for latest issue
|
# find page for latest issue
|
||||||
latestlink = browser.find_link(text_regex=re.compile(
|
latestlink = browser.find_link(text_regex=re.compile(
|
||||||
'.*ZUR AKTUELLEN AUSGABE.*'))
|
r'.*ZUR AKTUELLEN AUSGABE.*'))
|
||||||
browser.follow_link(latestlink)
|
browser.follow_link(latestlink)
|
||||||
# actual cover search
|
# actual cover search
|
||||||
pdflink = browser.find_link(text_regex=re.compile(
|
pdflink = browser.find_link(text_regex=re.compile(
|
||||||
'.*GESAMT-PDF LADEN.*'))
|
r'.*GESAMT-PDF LADEN.*'))
|
||||||
cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + (
|
cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + (
|
||||||
urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf')
|
urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf')
|
||||||
self.log.warning('PDF link found:')
|
self.log.warning('PDF link found:')
|
||||||
|
@ -34,6 +34,7 @@ select = [
|
|||||||
# preview rules
|
# preview rules
|
||||||
'RUF051', 'RUF056', # useless dict operation
|
'RUF051', 'RUF056', # useless dict operation
|
||||||
'RUF055', # unnecessary regex
|
'RUF055', # unnecessary regex
|
||||||
|
'RUF039', # always use raw-string for regex
|
||||||
]
|
]
|
||||||
|
|
||||||
[lint.per-file-ignores]
|
[lint.per-file-ignores]
|
||||||
@ -46,7 +47,7 @@ select = [
|
|||||||
"src/calibre/gui2/store/stores/*" = ['UP']
|
"src/calibre/gui2/store/stores/*" = ['UP']
|
||||||
"src/calibre/gui2/tts/manager.py" = ['UP037']
|
"src/calibre/gui2/tts/manager.py" = ['UP037']
|
||||||
"src/calibre/utils/copy_files.py" = ['UP037']
|
"src/calibre/utils/copy_files.py" = ['UP037']
|
||||||
"src/calibre/utils/smartypants.py" = ['RUF055']
|
"src/calibre/utils/smartypants.py" = ['RUF039', 'RUF055']
|
||||||
"src/qt/*.py" = ['I']
|
"src/qt/*.py" = ['I']
|
||||||
"src/qt/*.pyi" = ['I']
|
"src/qt/*.pyi" = ['I']
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ import time
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
iswindows = re.search('win(32|64)', sys.platform)
|
iswindows = re.search(r'win(32|64)', sys.platform)
|
||||||
ismacos = 'darwin' in sys.platform
|
ismacos = 'darwin' in sys.platform
|
||||||
isfreebsd = 'freebsd' in sys.platform
|
isfreebsd = 'freebsd' in sys.platform
|
||||||
isnetbsd = 'netbsd' in sys.platform
|
isnetbsd = 'netbsd' in sys.platform
|
||||||
|
@ -657,7 +657,7 @@ class Parser(SearchQueryParser): # {{{
|
|||||||
|
|
||||||
if location == 'template':
|
if location == 'template':
|
||||||
try:
|
try:
|
||||||
template, sep, query = regex.split('#@#:([tdnb]):', query, flags=regex.IGNORECASE)
|
template, sep, query = regex.split(r'#@#:([tdnb]):', query, flags=regex.IGNORECASE)
|
||||||
if sep:
|
if sep:
|
||||||
sep = sep.lower()
|
sep = sep.lower()
|
||||||
else:
|
else:
|
||||||
|
@ -34,7 +34,7 @@ class CYBOOK(USBMS):
|
|||||||
|
|
||||||
VENDOR_NAME = 'BOOKEEN'
|
VENDOR_NAME = 'BOOKEEN'
|
||||||
WINDOWS_MAIN_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-FD')
|
WINDOWS_MAIN_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-FD')
|
||||||
WINDOWS_CARD_A_MEM = re.compile('CYBOOK_(OPUS|GEN3)__-SD')
|
WINDOWS_CARD_A_MEM = re.compile(r'CYBOOK_(OPUS|GEN3)__-SD')
|
||||||
OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Cybook')
|
OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Cybook')
|
||||||
|
|
||||||
EBOOK_DIR_MAIN = 'eBooks'
|
EBOOK_DIR_MAIN = 'eBooks'
|
||||||
@ -72,7 +72,7 @@ class ORIZON(CYBOOK):
|
|||||||
|
|
||||||
VENDOR_NAME = ['BOOKEEN', 'LINUX']
|
VENDOR_NAME = ['BOOKEEN', 'LINUX']
|
||||||
WINDOWS_MAIN_MEM = re.compile(r'(CYBOOK_ORIZON__-FD)|(FILE-STOR_GADGET)')
|
WINDOWS_MAIN_MEM = re.compile(r'(CYBOOK_ORIZON__-FD)|(FILE-STOR_GADGET)')
|
||||||
WINDOWS_CARD_A_MEM = re.compile('(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)')
|
WINDOWS_CARD_A_MEM = re.compile(r'(CYBOOK_ORIZON__-SD)|(FILE-STOR_GADGET)')
|
||||||
|
|
||||||
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Digital Editions'
|
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Digital Editions'
|
||||||
|
|
||||||
|
@@ -58,11 +58,11 @@ def build_template_regexp(template):

    try:
        template = template.rpartition('/')[2]
-       return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)')
+       return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)')
    except:
        prints('Failed to parse template: %r'%template)
        template = '{title} - {authors}'
-       return re.compile(re.sub('{([^}]*)}', f, template) + r'([_\d]*$)')
+       return re.compile(re.sub(r'{([^}]*)}', f, template) + r'([_\d]*$)')


 def create_upload_path(mdata, fname, template, sanitize,
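A note on hunks like this one: '{([^}]*)}' contains no backslashes, so the raw prefix is behavior-neutral and the fix is purely prophylactic. A sketch of what the substitution itself does (the callback f below is a hypothetical stand-in for the one in the real code):

    import re

    f = lambda m: '(?P<%s>.+)' % m.group(1)  # hypothetical stand-in
    print(re.sub(r'{([^}]*)}', f, '{title} - {authors}'))
    # -> (?P<title>.+) - (?P<authors>.+)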
@@ -239,7 +239,7 @@ def generate_masthead(title, output_path=None, width=600, height=60):
 def escape_xpath_attr(value):
     if '"' in value:
         if "'" in value:
-            parts = re.split('("+)', value)
+            parts = re.split(r'("+)', value)
             ans = []
             for x in parts:
                 if x:
@@ -42,7 +42,7 @@ def _metadata_from_table(soup, searchfor):
         # on the home page. cue some nasty special-case hacks...
         if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
             meta = _detag(td.findNextSibling('td'))
-            return re.sub('^:', '', meta).strip()
+            return re.sub(r'^:', '', meta).strip()
         else:
             meta = _detag(td)
             return re.sub(r'^[^:]+:', '', meta).strip()
@@ -89,7 +89,7 @@ def _get_comments(soup):
 def _get_cover(soup, rdr):
     ans = None
     try:
-        ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
+        ans = soup.find('img', alt=re.compile(r'cover', flags=re.I))['src']
     except TypeError:
         # meeehh, no handy alt-tag goodness, try some hackery
         # the basic idea behind this is that in general, the cover image
@@ -16,7 +16,7 @@ XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+_span_pat = re.compile(r'<span.*?</span>', re.DOTALL|re.IGNORECASE)

 LIGATURES = {
     # 'Æ': 'AE',
@@ -92,7 +92,7 @@ class DocAnalysis:
         elif format == 'pdf':
             linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
-            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+            linere = re.compile(r'(?<=<span).*?(?=</span>)', re.DOTALL)
         elif format == 'txt':
             linere = re.compile('.*?\n')
         self.lines = linere.findall(raw)
@@ -430,16 +430,16 @@ def book_designer_rules():
     if ans is None:
         ans = book_designer_rules.ans = [
             # HR
-            (re.compile('<hr>', re.IGNORECASE),
+            (re.compile(r'<hr>', re.IGNORECASE),
              lambda match : '<span style="page-break-after:always"> </span>'),
             # Create header tags
             (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
              lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
             (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
              lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+            (re.compile(r'<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
              lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+            (re.compile(r'<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
              lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
         ]
     return ans
@@ -458,7 +458,7 @@ class HTMLPreProcessor:
                 re.IGNORECASE).search(src) is not None

     def is_book_designer(self, raw):
-        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+        return re.search(r'<H2[^><]*id=BookTitle', raw) is not None

     def is_pdftohtml(self, src):
         return "<!-- created by calibre's pdftohtml -->" in src[:1000]
@@ -27,7 +27,7 @@ class HeuristicProcessor:
         self.chapters_with_title = 0
         self.blanks_deleted = False
         self.blanks_between_paragraphs = False
-        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.linereg = re.compile(r'(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
@@ -108,7 +108,7 @@ class HeuristicProcessor:
         inspect. Percent is the minimum percent of line endings which should
         be marked up to return true.
         '''
-        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
+        htm_end_ere = re.compile(r'</(p|div)>', re.DOTALL)
         line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
         htm_end = htm_end_ere.findall(raw)
         line_end = line_end_ere.findall(raw)
@@ -209,7 +209,7 @@ class HeuristicProcessor:
         typical_chapters = 15000.
         self.min_chapters = int(ceil(wordcount / typical_chapters))
         self.log.debug('minimum chapters required are: '+str(self.min_chapters))
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        heading = re.compile(r'<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings')

@@ -299,7 +299,7 @@ class HeuristicProcessor:
                     break
         full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
         if n_lookahead_req:
             n_lookahead = re.sub('(ou|in|cha)', 'lookahead_', full_chapter_line)
-            n_lookahead = re.sub('(ou|in|cha)', 'lookahead_', full_chapter_line)
+            n_lookahead = re.sub(r'(ou|in|cha)', 'lookahead_', full_chapter_line)
         if not analyze:
             self.log.debug('Marked ' + str(self.html_preprocess_sections) + ' headings, ' + log_message)

@@ -442,7 +442,7 @@ class HeuristicProcessor:
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\\w+>', '', html)
         # Re-open self closing paragraph tags
-        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
+        html = re.sub(r'<p[^>/]*/>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         fmt_tags = 'font|[ibu]|em|strong'
         open_fmt_pat, close_fmt_pat = fr'<(?:{fmt_tags})(?:\s[^>]*)?>', f'</(?:{fmt_tags})>'
@@ -462,8 +462,8 @@ class HeuristicProcessor:
         determines the type of html line ending used most commonly in a document
         use before calling docanalysis functions
         '''
-        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras_reg = re.compile(r'<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile(r'<span[^>]*>', re.IGNORECASE)
         paras = len(paras_reg.findall(html))
         spans = len(spans_reg.findall(html))
         if spans > 1:
@@ -557,8 +557,8 @@ class HeuristicProcessor:

     def detect_soft_breaks(self, html):
         line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
-        line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
-                   '\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
+        line_two = '(?P<line_two>'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \
+                   '\\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')'
         div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
         div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)

@@ -596,8 +596,8 @@ class HeuristicProcessor:
         All other html is converted to text.
         '''
         hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
-        if re.findall('(<|>)', replacement_break):
-            if re.match('^<hr', replacement_break):
+        if re.findall(r'(<|>)', replacement_break):
+            if re.match(r'^<hr', replacement_break):
                 if replacement_break.find('width') != -1:
                     try:
                         width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
@@ -608,11 +608,11 @@ class HeuristicProcessor:
                     else:
                         replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
                         divpercent = (100 - width) // 2
-                        hr_open = re.sub('45', str(divpercent), hr_open)
+                        hr_open = re.sub(r'45', str(divpercent), hr_open)
                         scene_break = hr_open+replacement_break+'</div>'
                 else:
                     scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
-            elif re.match('^<img', replacement_break):
+            elif re.match(r'^<img', replacement_break):
                 scene_break = self.scene_break_open+replacement_break+'</p>'
             else:
                 from calibre.utils.html2text import html2text
@@ -638,7 +638,7 @@ class HeuristicProcessor:
         empty_paragraph = '\n<p> </p>\n'
         self.in_blockquote = False
         self.previous_was_paragraph = False
-        html = re.sub('</?a[^>]*>', '', html)
+        html = re.sub(r'</?a[^>]*>', '', html)

         def convert_styles(match):
             # print('raw styles are: '+match.group('styles'))
@@ -91,7 +91,7 @@ class HTMLFile:

     HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
     HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE)
-    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+    TITLE_PAT = re.compile(r'<title>([^<>]+)</title>', re.IGNORECASE)
     LINK_PAT = re.compile(
         r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
         re.DOTALL|re.IGNORECASE)
@@ -269,7 +269,7 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
             tag = 'div'
         # Add page-break-brefore: always because renders typically treat a new file (we're merging files)
         # as a page break and remove all other page break types that might be set.
-        style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
+        style_a = 'page-break-before: always; %s' % re.sub(r'page-break-[^:]+:[^;]+;?', '', style_a)
         # Remove unnecessary spaces.
         style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
         tags.append(tag)
@@ -34,8 +34,8 @@ class Hyphenator:
     def _insert_pattern(self, pattern):
         # Convert a pattern like 'a1bc3d4' into a string of chars 'abcd'
        # and a list of points [ 1, 0, 3, 4 ].
-        chars = re.sub('[0-9]', '', pattern)
-        points = [int(d or 0) for d in re.split('[.a-z]', pattern)]
+        chars = re.sub(r'[0-9]', '', pattern)
+        points = [int(d or 0) for d in re.split(r'[.a-z]', pattern)]

         # Insert the pattern into the tree. Each character finds a dict
         # another level down in the tree, and leaf nodes have the list of
@@ -163,7 +163,7 @@ class HTMLConverter:
     # Fix Book Designer markup
     BOOK_DESIGNER = [
         # HR
-        (re.compile('<hr>', re.IGNORECASE),
+        (re.compile(r'<hr>', re.IGNORECASE),
          lambda match : '<span style="page-break-after:always"> </span>'),
         # Create header tags
         (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
@@ -279,7 +279,7 @@ class HTMLConverter:
             if isinstance(src, bytes):
                 src = src.decode('utf-8', 'replace')
             match = self.PAGE_BREAK_PAT.search(src)
-            if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+            if match and not re.match(r'avoid', match.group(1), re.IGNORECASE):
                 self.page_break_found = True
             ncss, npcss = self.parse_css(src)
             if ncss:
@@ -324,10 +324,10 @@ class HTMLConverter:

     def is_baen(self, soup):
         return bool(soup.find('meta', attrs={'name':'Publisher',
-                                             'content':re.compile('Baen', re.IGNORECASE)}))
+                                             'content':re.compile(r'Baen', re.IGNORECASE)}))

     def is_book_designer(self, raw):
-        return bool(re.search('<H2[^><]*id=BookTitle', raw))
+        return bool(re.search(r'<H2[^><]*id=BookTitle', raw))

     def preprocess(self, raw):
         nmassage = []
@@ -1152,7 +1152,7 @@ class HTMLConverter:

         def font_weight(val):
             ans = 0
-            m = re.search('([0-9]+)', val)
+            m = re.search(r'([0-9]+)', val)
             if m:
                 ans = int(m.group(1))
             elif val.find('bold') >= 0 or val.find('strong') >= 0:
@@ -1544,7 +1544,7 @@ class HTMLConverter:
                 with open(path, 'rb') as f:
                     src = f.read().decode('utf-8', 'replace')
                 match = self.PAGE_BREAK_PAT.search(src)
-                if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+                if match and not re.match(r'avoid', match.group(1), re.IGNORECASE):
                     self.page_break_found = True
                 ncss, npcss = self.parse_css(src)
             except OSError:
@@ -1869,11 +1869,11 @@ def process_file(path, options, logger):
         header.append(fheader + ' ')
     book, fonts = Book(options, logger, header=header, **args)
     le = re.compile(options.link_exclude) if options.link_exclude else \
-        re.compile('$')
+        re.compile(r'$')
     pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
-        re.compile('$')
+        re.compile(r'$')
     fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
-        re.compile('$')
+        re.compile(r'$')
     cq = options.chapter_attr.split(',')
     if len(cq) < 3:
         raise ValueError('The --chapter-attr setting must have 2 commas.')
@@ -213,7 +213,7 @@ class Row:
     def __init__(self, conv, row, css, colpad):
         self.cells = []
         self.colpad = colpad
-        cells = row.findAll(re.compile('td|th', re.IGNORECASE))
+        cells = row.findAll(re.compile(r'td|th', re.IGNORECASE))
         self.targets = []
         for cell in cells:
             ccss = conv.tag_css(cell, css)[0]
@@ -172,7 +172,7 @@ def get_title_sort_pat(lang=None):
        except:
            ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    else:
-       ans = re.compile('^$')  # matches only the empty string
+       ans = re.compile(r'^$')  # matches only the empty string
    _title_pats[lang] = ans
    return ans

@@ -139,7 +139,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None):
        try:
            pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)
        except Exception:
-           pat = regex.compile('(?P<title>.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)
+           pat = regex.compile(r'(?P<title>.+) - (?P<author>[^_]+)', flags=regex.UNICODE | regex.VERSION0 | regex.FULLCASE)

    name = name.replace('_', ' ')
    match = pat.search(name)
@@ -59,4 +59,4 @@ def set_metadata(stream, mi):
     MetadataWriter(stream, mi)

     stream.seek(0)
-    stream.write(re.sub('[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00')
+    stream.write(re.sub(r'[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00')
@@ -365,7 +365,7 @@ class Worker(Thread):  # Get details {{{
             r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '
             r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'
         )
-        self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')
+        self.ratings_pat_cn = re.compile(r'([0-9.]+) 颗星,最多 5 颗星')
         self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)')

         lm = {
@@ -165,7 +165,7 @@ def wayback_url_processor(url):
    if url.startswith('/'):
        # Use original URL instead of absolutizing to wayback URL as wayback is
        # slow
-       m = re.search('https?:', url)
+       m = re.search(r'https?:', url)
        if m is None:
            url = 'https://web.archive.org' + url
        else:
@@ -380,7 +380,7 @@ class MobiReader:
         self.processed_html = re.sub(
             r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
         bods = htmls = 0
-        for x in re.finditer('</body>|</html>', self.processed_html):
+        for x in re.finditer(r'</body>|</html>', self.processed_html):
             if x == '</body>':
                 bods +=1
             else:
@@ -155,7 +155,7 @@ def hfix(name, raw):
    return raw


-CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in iteritems(HELP)}
+CLI_HELP = {x:hfix(x, re.sub(r'<.*?>', '', y)) for x, y in iteritems(HELP)}
 # }}}


@@ -36,7 +36,7 @@ class Patterns:
         # French words with prefixes are reduced to the stem word, so that the
         # words appear only once in the word list
         self.fr_elision_pat = regex.compile(
-            "^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)
+            r"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)


 def patterns():
@@ -102,7 +102,7 @@ class SVGRasterizer:

         if view_box is not None:
             try:
-                box = [float(x) for x in filter(None, re.split('[, ]', view_box))]
+                box = [float(x) for x in filter(None, re.split(r'[, ]', view_box))]
                 sizes = [box[2]-box[0], box[3] - box[1]]
             except (TypeError, ValueError, IndexError):
                 logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box)
@@ -152,7 +152,7 @@ def flip_image(img, flip):


 def flip_images(raw):
-    for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
+    for match in re.finditer(r'<IMG[^>]+/?>', raw, flags=re.I):
         img = match.group()
         m = re.search(r'class="(x|y|xy)flip"', img)
         if m is None:
@@ -174,5 +174,5 @@ def flip_images(raw):
         counter += 1
         return m.group(1).rstrip('/') + f' alt="Image {counter}"/>'

-    raw = re.sub('(<IMG[^>]+)/?>', add_alt, raw, flags=re.I)
+    raw = re.sub(r'(<IMG[^>]+)/?>', add_alt, raw, flags=re.I)
     return raw
@@ -121,7 +121,7 @@ class Font:
         self.metrics, self.compress = metrics, compress
         self.is_otf = self.metrics.is_otf
         self.subset_tag = str(
-            re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
+            re.sub(r'.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
             )).rjust(6, 'A')
         self.font_stream = FontStream(metrics.is_otf, compress=compress)
         try:
@@ -199,11 +199,11 @@ class PMLMLizer:
         text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)

         # Remove excess spaces at beginning and end of lines
-        text = re.sub('(?m)^[ ]+', '', text)
-        text = re.sub('(?m)[ ]+$', '', text)
+        text = re.sub(r'(?m)^[ ]+', '', text)
+        text = re.sub(r'(?m)[ ]+$', '', text)

         # Remove excessive spaces
-        text = re.sub('[ ]{2,}', ' ', text)
+        text = re.sub(r'[ ]{2,}', ' ', text)

         # Condense excessive \c empty line sequences.
         text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)
@@ -213,7 +213,7 @@ class PMLMLizer:
         if self.opts.remove_paragraph_spacing:
             text = re.sub('\n{2,}', '\n', text)
             # Only indent lines that don't have special formatting
-            text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
+            text = re.sub(r'(?imu)^(?P<text>.+)$', lambda mo: mo.group('text')
                           if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
         else:
             text = re.sub('\n{3,}', '\n\n', text)
@@ -19,11 +19,11 @@ def tounicode(tree_or_node, **kwargs):


 REGEXES = {
-    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),  # noqa: E501
-    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
-    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
-    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),  # noqa: E501
-    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
+    'unlikelyCandidatesRe': re.compile(r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),  # noqa: E501
+    'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main|shadow',re.I),
+    'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
+    'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),  # noqa: E501
+    'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
     # 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
     # 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
     # 'trimRe': re.compile('^\s+|\s+$/'),
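A quick way to convince yourself that changes like the one above are no-ops at runtime: when a pattern contains no backslash, the plain and raw literals produce the same string, so the compiled regexes are identical.

    import re

    plain = re.compile('and|article|body|column|main|shadow', re.I)
    raw = re.compile(r'and|article|body|column|main|shadow', re.I)
    assert plain.pattern == raw.pattern  # identical source reaches the engine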
@@ -121,7 +121,7 @@ class RTFMLizer:
         self.log.debug('Converting %s to RTF markup...' % item.href)
         # Removing comments is needed as comments with -- inside them can
         # cause fromstring() to fail
-        content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
+        content = re.sub(r'<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
         content = self.remove_newlines(content)
         content = self.remove_tabs(content)
         content = safe_xml_fromstring(content)
@@ -198,7 +198,7 @@ class RTFMLizer:
         text = re.sub('%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text)

         # Remove excessive spaces
-        text = re.sub('[ ]{2,}', ' ', text)
+        text = re.sub(r'[ ]{2,}', ' ', text)
         text = re.sub('\t{2,}', '\t', text)
         text = text.replace('\t ', '\t')

|
@ -652,7 +652,7 @@ class ProcessTokens:
|
|||||||
return f'cw<{pre}<{token}<nu<{type}\n'
|
return f'cw<{pre}<{token}<nu<{type}\n'
|
||||||
|
|
||||||
def __language_func(self, pre, token, num):
|
def __language_func(self, pre, token, num):
|
||||||
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
|
lang_name = self.__language_dict.get(int(re.search(r'[0-9]+', num).group()))
|
||||||
if not lang_name:
|
if not lang_name:
|
||||||
lang_name = 'not defined'
|
lang_name = 'not defined'
|
||||||
if self.__run_level > 3:
|
if self.__run_level > 3:
|
||||||
|
@ -165,13 +165,13 @@ class SNBMLizer:
|
|||||||
text = re.sub('\n[ ]+\n', '\n\n', text)
|
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||||
if self.opts.remove_paragraph_spacing:
|
if self.opts.remove_paragraph_spacing:
|
||||||
text = re.sub('\n{2,}', '\n', text)
|
text = re.sub('\n{2,}', '\n', text)
|
||||||
text = re.sub('(?imu)^(?=.)', '\t', text)
|
text = re.sub(r'(?imu)^(?=.)', '\t', text)
|
||||||
else:
|
else:
|
||||||
text = re.sub('\n{3,}', '\n\n', text)
|
text = re.sub('\n{3,}', '\n\n', text)
|
||||||
|
|
||||||
# Replace spaces at the beginning and end of lines
|
# Replace spaces at the beginning and end of lines
|
||||||
text = re.sub('(?imu)^[ ]+', '', text)
|
text = re.sub(r'(?imu)^[ ]+', '', text)
|
||||||
text = re.sub('(?imu)[ ]+$', '', text)
|
text = re.sub(r'(?imu)[ ]+$', '', text)
|
||||||
|
|
||||||
if self.opts.snb_max_line_length:
|
if self.opts.snb_max_line_length:
|
||||||
max_length = self.opts.snb_max_line_length
|
max_length = self.opts.snb_max_line_length
|
||||||
|
@@ -6,117 +6,117 @@ import re


 def unsmarten(txt):
-    txt = re.sub('&#162;|&cent;|¢', r'{c\}', txt)  # cent
-    txt = re.sub('&#163;|&pound;|£', r'{L-}', txt)  # pound
-    txt = re.sub('&#165;|&yen;|¥', r'{Y=}', txt)  # yen
-    txt = re.sub('&#169;|&copy;|©', r'{(c)}', txt)  # copyright
-    txt = re.sub('&#174;|&reg;|®', r'{(r)}', txt)  # registered
-    txt = re.sub('&#188;|&frac14;|¼', r'{1/4}', txt)  # quarter
-    txt = re.sub('&#189;|&frac12;|½', r'{1/2}', txt)  # half
-    txt = re.sub('&#190;|&frac34;|¾', r'{3/4}', txt)  # three-quarter
-    txt = re.sub('&#192;|&Agrave;|À', r'{A`)}', txt)  # A-grave
-    txt = re.sub('&#193;|&Aacute;|Á', r"{A'}", txt)  # A-acute
-    txt = re.sub('&#194;|&Acirc;|Â', r'{A^}', txt)  # A-circumflex
-    txt = re.sub('&#195;|&Atilde;|Ã', r'{A~}', txt)  # A-tilde
-    txt = re.sub('&#196;|&Auml;|Ä', r'{A"}', txt)  # A-umlaut
-    txt = re.sub('&#197;|&Aring;|Å', r'{Ao}', txt)  # A-ring
-    txt = re.sub('&#198;|&AElig;|Æ', r'{AE}', txt)  # AE
-    txt = re.sub('&#199;|&Ccedil;|Ç', r'{C,}', txt)  # C-cedilla
-    txt = re.sub('&#200;|&Egrave;|È', r'{E`}', txt)  # E-grave
-    txt = re.sub('&#201;|&Eacute;|É', r"{E'}", txt)  # E-acute
-    txt = re.sub('&#202;|&Ecirc;|Ê', r'{E^}', txt)  # E-circumflex
-    txt = re.sub('&#203;|&Euml;|Ë', r'{E"}', txt)  # E-umlaut
-    txt = re.sub('&#204;|&Igrave;|Ì', r'{I`}', txt)  # I-grave
-    txt = re.sub('&#205;|&Iacute;|Í', r"{I'}", txt)  # I-acute
-    txt = re.sub('&#206;|&Icirc;|Î', r'{I^}', txt)  # I-circumflex
-    txt = re.sub('&#207;|&Iuml;|Ï', r'{I"}', txt)  # I-umlaut
-    txt = re.sub('&#208;|&ETH;|Ð', r'{D-}', txt)  # ETH
-    txt = re.sub('&#209;|&Ntilde;|Ñ', r'{N~}', txt)  # N-tilde
-    txt = re.sub('&#210;|&Ograve;|Ò', r'{O`}', txt)  # O-grave
-    txt = re.sub('&#211;|&Oacute;|Ó', r"{O'}", txt)  # O-acute
-    txt = re.sub('&#212;|&Ocirc;|Ô', r'{O^}', txt)  # O-circumflex
-    txt = re.sub('&#213;|&Otilde;|Õ', r'{O~}', txt)  # O-tilde
-    txt = re.sub('&#214;|&Ouml;|Ö', r'{O"}', txt)  # O-umlaut
-    txt = re.sub('&#215;|&times;|×', r'{x}', txt)  # dimension
-    txt = re.sub('&#216;|&Oslash;|Ø', r'{O/}', txt)  # O-slash
-    txt = re.sub('&#217;|&Ugrave;|Ù', r'{U`}', txt)  # U-grave
-    txt = re.sub('&#218;|&Uacute;|Ú', r"{U'}", txt)  # U-acute
-    txt = re.sub('&#219;|&Ucirc;|Û', r'{U^}', txt)  # U-circumflex
-    txt = re.sub('&#220;|&Uuml;|Ü', r'{U"}', txt)  # U-umlaut
-    txt = re.sub('&#221;|&Yacute;|Ý', r"{Y'}", txt)  # Y-grave
-    txt = re.sub('&#223;|&szlig;|ß', r'{sz}', txt)  # sharp-s
-    txt = re.sub('&#224;|&agrave;|à', r'{a`}', txt)  # a-grave
-    txt = re.sub('&#225;|&aacute;|á', r"{a'}", txt)  # a-acute
-    txt = re.sub('&#226;|&acirc;|â', r'{a^}', txt)  # a-circumflex
-    txt = re.sub('&#227;|&atilde;|ã', r'{a~}', txt)  # a-tilde
-    txt = re.sub('&#228;|&auml;|ä', r'{a"}', txt)  # a-umlaut
-    txt = re.sub('&#229;|&aring;|å', r'{ao}', txt)  # a-ring
-    txt = re.sub('&#230;|&aelig;|æ', r'{ae}', txt)  # ae
-    txt = re.sub('&#231;|&ccedil;|ç', r'{c,}', txt)  # c-cedilla
-    txt = re.sub('&#232;|&egrave;|è', r'{e`}', txt)  # e-grave
-    txt = re.sub('&#233;|&eacute;|é', r"{e'}", txt)  # e-acute
-    txt = re.sub('&#234;|&ecirc;|ê', r'{e^}', txt)  # e-circumflex
-    txt = re.sub('&#235;|&euml;|ë', r'{e"}', txt)  # e-umlaut
-    txt = re.sub('&#236;|&igrave;|ì', r'{i`}', txt)  # i-grave
-    txt = re.sub('&#237;|&iacute;|í', r"{i'}", txt)  # i-acute
-    txt = re.sub('&#238;|&icirc;|î', r'{i^}', txt)  # i-circumflex
-    txt = re.sub('&#239;|&iuml;|ï', r'{i"}', txt)  # i-umlaut
-    txt = re.sub('&#240;|&eth;|ð', r'{d-}', txt)  # eth
-    txt = re.sub('&#241;|&ntilde;|ñ', r'{n~}', txt)  # n-tilde
-    txt = re.sub('&#242;|&ograve;|ò', r'{o`}', txt)  # o-grave
-    txt = re.sub('&#243;|&oacute;|ó', r"{o'}", txt)  # o-acute
-    txt = re.sub('&#244;|&ocirc;|ô', r'{o^}', txt)  # o-circumflex
-    txt = re.sub('&#245;|&otilde;|õ', r'{o~}', txt)  # o-tilde
-    txt = re.sub('&#246;|&ouml;|ö', r'{o"}', txt)  # o-umlaut
-    txt = re.sub('&#248;|&oslash;|ø', r'{o/}', txt)  # o-stroke
-    txt = re.sub('&#249;|&ugrave;|ù', r'{u`}', txt)  # u-grave
-    txt = re.sub('&#250;|&uacute;|ú', r"{u'}", txt)  # u-acute
-    txt = re.sub('&#251;|&ucirc;|û', r'{u^}', txt)  # u-circumflex
-    txt = re.sub('&#252;|&uuml;|ü', r'{u"}', txt)  # u-umlaut
-    txt = re.sub('&#253;|&yacute;|ý', r"{y'}", txt)  # y-acute
-    txt = re.sub('&#255;|&yuml;|ÿ', r'{y"}', txt)  # y-umlaut
+    txt = re.sub(r'&#162;|&cent;|¢', r'{c\}', txt)  # cent
+    txt = re.sub(r'&#163;|&pound;|£', r'{L-}', txt)  # pound
+    txt = re.sub(r'&#165;|&yen;|¥', r'{Y=}', txt)  # yen
+    txt = re.sub(r'&#169;|&copy;|©', r'{(c)}', txt)  # copyright
+    txt = re.sub(r'&#174;|&reg;|®', r'{(r)}', txt)  # registered
+    txt = re.sub(r'&#188;|&frac14;|¼', r'{1/4}', txt)  # quarter
+    txt = re.sub(r'&#189;|&frac12;|½', r'{1/2}', txt)  # half
+    txt = re.sub(r'&#190;|&frac34;|¾', r'{3/4}', txt)  # three-quarter
+    txt = re.sub(r'&#192;|&Agrave;|À', r'{A`)}', txt)  # A-grave
+    txt = re.sub(r'&#193;|&Aacute;|Á', r"{A'}", txt)  # A-acute
+    txt = re.sub(r'&#194;|&Acirc;|Â', r'{A^}', txt)  # A-circumflex
+    txt = re.sub(r'&#195;|&Atilde;|Ã', r'{A~}', txt)  # A-tilde
+    txt = re.sub(r'&#196;|&Auml;|Ä', r'{A"}', txt)  # A-umlaut
+    txt = re.sub(r'&#197;|&Aring;|Å', r'{Ao}', txt)  # A-ring
+    txt = re.sub(r'&#198;|&AElig;|Æ', r'{AE}', txt)  # AE
+    txt = re.sub(r'&#199;|&Ccedil;|Ç', r'{C,}', txt)  # C-cedilla
+    txt = re.sub(r'&#200;|&Egrave;|È', r'{E`}', txt)  # E-grave
+    txt = re.sub(r'&#201;|&Eacute;|É', r"{E'}", txt)  # E-acute
+    txt = re.sub(r'&#202;|&Ecirc;|Ê', r'{E^}', txt)  # E-circumflex
+    txt = re.sub(r'&#203;|&Euml;|Ë', r'{E"}', txt)  # E-umlaut
+    txt = re.sub(r'&#204;|&Igrave;|Ì', r'{I`}', txt)  # I-grave
+    txt = re.sub(r'&#205;|&Iacute;|Í', r"{I'}", txt)  # I-acute
+    txt = re.sub(r'&#206;|&Icirc;|Î', r'{I^}', txt)  # I-circumflex
+    txt = re.sub(r'&#207;|&Iuml;|Ï', r'{I"}', txt)  # I-umlaut
+    txt = re.sub(r'&#208;|&ETH;|Ð', r'{D-}', txt)  # ETH
+    txt = re.sub(r'&#209;|&Ntilde;|Ñ', r'{N~}', txt)  # N-tilde
+    txt = re.sub(r'&#210;|&Ograve;|Ò', r'{O`}', txt)  # O-grave
+    txt = re.sub(r'&#211;|&Oacute;|Ó', r"{O'}", txt)  # O-acute
+    txt = re.sub(r'&#212;|&Ocirc;|Ô', r'{O^}', txt)  # O-circumflex
+    txt = re.sub(r'&#213;|&Otilde;|Õ', r'{O~}', txt)  # O-tilde
+    txt = re.sub(r'&#214;|&Ouml;|Ö', r'{O"}', txt)  # O-umlaut
+    txt = re.sub(r'&#215;|&times;|×', r'{x}', txt)  # dimension
+    txt = re.sub(r'&#216;|&Oslash;|Ø', r'{O/}', txt)  # O-slash
+    txt = re.sub(r'&#217;|&Ugrave;|Ù', r'{U`}', txt)  # U-grave
+    txt = re.sub(r'&#218;|&Uacute;|Ú', r"{U'}", txt)  # U-acute
+    txt = re.sub(r'&#219;|&Ucirc;|Û', r'{U^}', txt)  # U-circumflex
+    txt = re.sub(r'&#220;|&Uuml;|Ü', r'{U"}', txt)  # U-umlaut
+    txt = re.sub(r'&#221;|&Yacute;|Ý', r"{Y'}", txt)  # Y-grave
+    txt = re.sub(r'&#223;|&szlig;|ß', r'{sz}', txt)  # sharp-s
+    txt = re.sub(r'&#224;|&agrave;|à', r'{a`}', txt)  # a-grave
+    txt = re.sub(r'&#225;|&aacute;|á', r"{a'}", txt)  # a-acute
+    txt = re.sub(r'&#226;|&acirc;|â', r'{a^}', txt)  # a-circumflex
+    txt = re.sub(r'&#227;|&atilde;|ã', r'{a~}', txt)  # a-tilde
+    txt = re.sub(r'&#228;|&auml;|ä', r'{a"}', txt)  # a-umlaut
+    txt = re.sub(r'&#229;|&aring;|å', r'{ao}', txt)  # a-ring
+    txt = re.sub(r'&#230;|&aelig;|æ', r'{ae}', txt)  # ae
+    txt = re.sub(r'&#231;|&ccedil;|ç', r'{c,}', txt)  # c-cedilla
+    txt = re.sub(r'&#232;|&egrave;|è', r'{e`}', txt)  # e-grave
+    txt = re.sub(r'&#233;|&eacute;|é', r"{e'}", txt)  # e-acute
+    txt = re.sub(r'&#234;|&ecirc;|ê', r'{e^}', txt)  # e-circumflex
+    txt = re.sub(r'&#235;|&euml;|ë', r'{e"}', txt)  # e-umlaut
+    txt = re.sub(r'&#236;|&igrave;|ì', r'{i`}', txt)  # i-grave
+    txt = re.sub(r'&#237;|&iacute;|í', r"{i'}", txt)  # i-acute
+    txt = re.sub(r'&#238;|&icirc;|î', r'{i^}', txt)  # i-circumflex
+    txt = re.sub(r'&#239;|&iuml;|ï', r'{i"}', txt)  # i-umlaut
+    txt = re.sub(r'&#240;|&eth;|ð', r'{d-}', txt)  # eth
+    txt = re.sub(r'&#241;|&ntilde;|ñ', r'{n~}', txt)  # n-tilde
+    txt = re.sub(r'&#242;|&ograve;|ò', r'{o`}', txt)  # o-grave
+    txt = re.sub(r'&#243;|&oacute;|ó', r"{o'}", txt)  # o-acute
+    txt = re.sub(r'&#244;|&ocirc;|ô', r'{o^}', txt)  # o-circumflex
+    txt = re.sub(r'&#245;|&otilde;|õ', r'{o~}', txt)  # o-tilde
+    txt = re.sub(r'&#246;|&ouml;|ö', r'{o"}', txt)  # o-umlaut
+    txt = re.sub(r'&#248;|&oslash;|ø', r'{o/}', txt)  # o-stroke
+    txt = re.sub(r'&#249;|&ugrave;|ù', r'{u`}', txt)  # u-grave
+    txt = re.sub(r'&#250;|&uacute;|ú', r"{u'}", txt)  # u-acute
+    txt = re.sub(r'&#251;|&ucirc;|û', r'{u^}', txt)  # u-circumflex
+    txt = re.sub(r'&#252;|&uuml;|ü', r'{u"}', txt)  # u-umlaut
+    txt = re.sub(r'&#253;|&yacute;|ý', r"{y'}", txt)  # y-acute
+    txt = re.sub(r'&#255;|&yuml;|ÿ', r'{y"}', txt)  # y-umlaut

-    txt = re.sub('&#268;|&Ccaron;|Č', r'{Cˇ}', txt)  # C-caron
-    txt = re.sub('&#269;|&ccaron;|č', r'{cˇ}', txt)  # c-caron
-    txt = re.sub('&#270;|&Dcaron;|Ď', r'{Dˇ}', txt)  # D-caron
-    txt = re.sub('&#271;|&dcaron;|ď', r'{dˇ}', txt)  # d-caron
-    txt = re.sub('&#282;|&Ecaron;|Ě', r'{Eˇ}', txt)  # E-caron
-    txt = re.sub('&#283;|&ecaron;|ě', r'{eˇ}', txt)  # e-caron
-    txt = re.sub('&#313;|&Lacute;|Ĺ', r"{L'}", txt)  # L-acute
-    txt = re.sub('&#314;|&lacute;|ĺ', r"{l'}", txt)  # l-acute
-    txt = re.sub('&#317;|&Lcaron;|Ľ', r'{Lˇ}', txt)  # L-caron
-    txt = re.sub('&#318;|&lcaron;|ľ', r'{lˇ}', txt)  # l-caron
-    txt = re.sub('&#327;|&Ncaron;|Ň', r'{Nˇ}', txt)  # N-caron
-    txt = re.sub('&#328;|&ncaron;|ň', r'{nˇ}', txt)  # n-caron
+    txt = re.sub(r'&#268;|&Ccaron;|Č', r'{Cˇ}', txt)  # C-caron
+    txt = re.sub(r'&#269;|&ccaron;|č', r'{cˇ}', txt)  # c-caron
+    txt = re.sub(r'&#270;|&Dcaron;|Ď', r'{Dˇ}', txt)  # D-caron
+    txt = re.sub(r'&#271;|&dcaron;|ď', r'{dˇ}', txt)  # d-caron
+    txt = re.sub(r'&#282;|&Ecaron;|Ě', r'{Eˇ}', txt)  # E-caron
+    txt = re.sub(r'&#283;|&ecaron;|ě', r'{eˇ}', txt)  # e-caron
+    txt = re.sub(r'&#313;|&Lacute;|Ĺ', r"{L'}", txt)  # L-acute
+    txt = re.sub(r'&#314;|&lacute;|ĺ', r"{l'}", txt)  # l-acute
+    txt = re.sub(r'&#317;|&Lcaron;|Ľ', r'{Lˇ}', txt)  # L-caron
+    txt = re.sub(r'&#318;|&lcaron;|ľ', r'{lˇ}', txt)  # l-caron
+    txt = re.sub(r'&#327;|&Ncaron;|Ň', r'{Nˇ}', txt)  # N-caron
+    txt = re.sub(r'&#328;|&ncaron;|ň', r'{nˇ}', txt)  # n-caron

-    txt = re.sub('&#338;|&OElig;|Œ', r'{OE}', txt)  # OE
-    txt = re.sub('&#339;|&oelig;|œ', r'{oe}', txt)  # oe
+    txt = re.sub(r'&#338;|&OElig;|Œ', r'{OE}', txt)  # OE
+    txt = re.sub(r'&#339;|&oelig;|œ', r'{oe}', txt)  # oe

-    txt = re.sub('&#340;|&Racute;|Ŕ', r"{R'}", txt)  # R-acute
-    txt = re.sub('&#341;|&racute;|ŕ', r"{r'}", txt)  # r-acute
-    txt = re.sub('&#344;|&Rcaron;|Ř', r'{Rˇ}', txt)  # R-caron
-    txt = re.sub('&#345;|&rcaron;|ř', r'{rˇ}', txt)  # r-caron
-    txt = re.sub('&#348;|Ŝ', r'{S^}', txt)  # S-circumflex
-    txt = re.sub('&#349;|ŝ', r'{s^}', txt)  # s-circumflex
-    txt = re.sub('&#352;|&Scaron;|Š', r'{Sˇ}', txt)  # S-caron
-    txt = re.sub('&#353;|&scaron;|š', r'{sˇ}', txt)  # s-caron
-    txt = re.sub('&#356;|&Tcaron;|Ť', r'{Tˇ}', txt)  # T-caron
-    txt = re.sub('&#357;|&tcaron;|ť', r'{tˇ}', txt)  # t-caron
-    txt = re.sub('&#366;|&Uring;|Ů', r'{U°}', txt)  # U-ring
-    txt = re.sub('&#367;|&uring;|ů', r'{u°}', txt)  # u-ring
-    txt = re.sub('&#381;|&Zcaron;|Ž', r'{Zˇ}', txt)  # Z-caron
-    txt = re.sub('&#382;|&zcaron;|ž', r'{zˇ}', txt)  # z-caron
+    txt = re.sub(r'&#340;|&Racute;|Ŕ', r"{R'}", txt)  # R-acute
+    txt = re.sub(r'&#341;|&racute;|ŕ', r"{r'}", txt)  # r-acute
+    txt = re.sub(r'&#344;|&Rcaron;|Ř', r'{Rˇ}', txt)  # R-caron
+    txt = re.sub(r'&#345;|&rcaron;|ř', r'{rˇ}', txt)  # r-caron
+    txt = re.sub(r'&#348;|Ŝ', r'{S^}', txt)  # S-circumflex
+    txt = re.sub(r'&#349;|ŝ', r'{s^}', txt)  # s-circumflex
+    txt = re.sub(r'&#352;|&Scaron;|Š', r'{Sˇ}', txt)  # S-caron
+    txt = re.sub(r'&#353;|&scaron;|š', r'{sˇ}', txt)  # s-caron
+    txt = re.sub(r'&#356;|&Tcaron;|Ť', r'{Tˇ}', txt)  # T-caron
+    txt = re.sub(r'&#357;|&tcaron;|ť', r'{tˇ}', txt)  # t-caron
+    txt = re.sub(r'&#366;|&Uring;|Ů', r'{U°}', txt)  # U-ring
+    txt = re.sub(r'&#367;|&uring;|ů', r'{u°}', txt)  # u-ring
+    txt = re.sub(r'&#381;|&Zcaron;|Ž', r'{Zˇ}', txt)  # Z-caron
+    txt = re.sub(r'&#382;|&zcaron;|ž', r'{zˇ}', txt)  # z-caron

-    txt = re.sub('&#8226;|&bull;|•', r'{*}', txt)  # bullet
-    txt = re.sub('&#8355;|₣', r'{Fr}', txt)  # Franc
-    txt = re.sub('&#8356;|₤', r'{L=}', txt)  # Lira
-    txt = re.sub('&#8360;|₨', r'{Rs}', txt)  # Rupee
-    txt = re.sub('&#8364;|&euro;|€', r'{C=}', txt)  # euro
-    txt = re.sub('&#8482;|&trade;|™', r'{tm}', txt)  # trademark
-    txt = re.sub('&#9824;|&spades;|♠', r'{spade}', txt)  # spade
-    txt = re.sub('&#9827;|&clubs;|♣', r'{club}', txt)  # club
-    txt = re.sub('&#9829;|&hearts;|♥', r'{heart}', txt)  # heart
-    txt = re.sub('&#9830;|&diams;|♦', r'{diamond}', txt)  # diamond
+    txt = re.sub(r'&#8226;|&bull;|•', r'{*}', txt)  # bullet
+    txt = re.sub(r'&#8355;|₣', r'{Fr}', txt)  # Franc
+    txt = re.sub(r'&#8356;|₤', r'{L=}', txt)  # Lira
+    txt = re.sub(r'&#8360;|₨', r'{Rs}', txt)  # Rupee
+    txt = re.sub(r'&#8364;|&euro;|€', r'{C=}', txt)  # euro
+    txt = re.sub(r'&#8482;|&trade;|™', r'{tm}', txt)  # trademark
+    txt = re.sub(r'&#9824;|&spades;|♠', r'{spade}', txt)  # spade
+    txt = re.sub(r'&#9827;|&clubs;|♣', r'{club}', txt)  # club
+    txt = re.sub(r'&#9829;|&hearts;|♥', r'{heart}', txt)  # heart
+    txt = re.sub(r'&#9830;|&diams;|♦', r'{diamond}', txt)  # diamond

     # Move into main code?
     # txt = re.sub('\xa0', r'p. ', txt)  # blank paragraph
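Two details in the long hunk above: the patterns are alternations of literal entities and characters (no backslashes), so the r prefix changes nothing at runtime, while replacement strings such as {c\} were already raw because a backslash in a replacement template is itself an escape character. A small sketch of the second point:

    import re

    # The raw replacement keeps the literal backslash in the output.
    print(re.sub(r'&#162;|&cent;|¢', r'{c\}', 'price: 5¢'))
    # -> price: 5{c\}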
@@ -51,9 +51,9 @@ class MarkdownMLizer(OEB2HTML):

     def tidy_up(self, text):
         # Remove blank space form beginning of paragraph.
-        text = re.sub('(?msu)^[ ]{1,3}', '', text)
+        text = re.sub(r'(?msu)^[ ]{1,3}', '', text)
         # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
-        text = re.sub('(?msu)^[ ]', '    ', text)
+        text = re.sub(r'(?msu)^[ ]', '    ', text)

         # Remove tabs that aren't at the beginning of a line
         new_text = []
@@ -68,7 +68,7 @@ class MarkdownMLizer(OEB2HTML):
         text = '\n'.join(new_text)

         # Remove spaces from blank lines.
-        text = re.sub('(?msu)^[ ]+$', '', text)
+        text = re.sub(r'(?msu)^[ ]+$', '', text)

         # Reduce blank lines
         text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
@@ -34,7 +34,7 @@ def clean_txt(txt):
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)

    # Condense redundant spaces
-   txt = re.sub('[ ]{2,}', ' ', txt)
+   txt = re.sub(r'[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
@@ -213,7 +213,7 @@ def preserve_spaces(txt):
    '''
    Replaces spaces multiple spaces with entities.
    '''
-   txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
+   txt = re.sub(r'(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt

@@ -325,9 +325,9 @@ def detect_formatting_type(txt):

    # Check for markdown
    # Headings
-   markdown_count += len(re.findall('(?mu)^#+', txt))
-   markdown_count += len(re.findall('(?mu)^=+$', txt))
-   markdown_count += len(re.findall('(?mu)^-+$', txt))
+   markdown_count += len(re.findall(r'(?mu)^#+', txt))
+   markdown_count += len(re.findall(r'(?mu)^=+$', txt))
+   markdown_count += len(re.findall(r'(?mu)^-+$', txt))
    # Images
    markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt))
    # Links
@@ -126,7 +126,7 @@ class TXTMLizer:
         text = re.sub('(?<=.)\n(?=.)', ' ', text)

         # Remove multiple spaces.
-        text = re.sub('[ ]{2,}', ' ', text)
+        text = re.sub(r'[ ]{2,}', ' ', text)

         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
@@ -140,8 +140,8 @@ class TXTMLizer:
         # Replace spaces at the beginning and end of lines
         # We don't replace tabs because those are only added
         # when remove paragraph spacing is enabled.
-        text = re.sub('(?imu)^[ ]+', '', text)
-        text = re.sub('(?imu)[ ]+$', '', text)
+        text = re.sub(r'(?imu)^[ ]+', '', text)
+        text = re.sub(r'(?imu)[ ]+$', '', text)

         # Remove empty space and newlines at the beginning of the document.
         text = re.sub(r'(?u)^[ \n]+', '', text)
@@ -406,7 +406,7 @@ class SearchDialog(QDialog):
         self.resize(self.sizeHint())

     def retrieve_template_search(self):
-        template, sep, query = re.split('#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE)
+        template, sep, query = re.split(r'#@#:([tdnb]):', self.current_search_text, flags=re.IGNORECASE)
         self.template_value_box.setText(query)
         cb = self.template_test_type_box
         for idx in range(0, cb.count()):
@@ -744,7 +744,7 @@ class CreateCustomColumn(QDialog):
                 return self.simple_error('', _('The colors box must be empty or '
                                                'contain the same number of items as the value box'))
             for tc in c:
-                if tc not in QColor.colorNames() and not re.match('#(?:[0-9a-f]{3}){1,4}',tc,re.I):
+                if tc not in QColor.colorNames() and not re.match(r'#(?:[0-9a-f]{3}){1,4}',tc,re.I):
                     return self.simple_error('', _('The color {0} is unknown').format(tc))
             display_dict = {'enum_values': l, 'enum_colors': c}
             if default_val:
@@ -146,7 +146,7 @@ class EmailAccounts(QAbstractTableModel):  # {{{
             if aval:
                 self.tags[account] = aval
         elif col == 1:
-            self.accounts[account][0] = re.sub(',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper()))
+            self.accounts[account][0] = re.sub(r',+', ',', re.sub(r'\s+', ',', as_unicode(value or '').upper()))
         elif col == 0:
             na = as_unicode(value or '').strip()
             from email.utils import parseaddr
@@ -920,6 +920,6 @@ if __name__ == '__main__':  # {{{

     def callback(ed):
         import regex
-        ed.find_text(regex.compile('A bold word'))
+        ed.find_text(regex.compile(r'A bold word'))
     launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
 # }}}
@@ -3828,7 +3828,7 @@ class CatalogBuilder:
             # if self.opts.numbers_as_text and re.match('[0-9]+',word[0]):
                 translated.append(NumberToText(word).text.capitalize())
             else:
-                if re.match('[0-9]+', word[0]):
+                if re.match(r'[0-9]+', word[0]):
                     word = word.replace(',', '')
                     suffix = re.search(r'[\D]', word)
                     if suffix:
@@ -3844,7 +3844,7 @@ class CatalogBuilder:
                     translated.append(capitalize(word))

             else:
-                if re.search('[0-9]+', word[0]):
+                if re.search(r'[0-9]+', word[0]):
                     word = word.replace(',', '')
                     suffix = re.search(r'[\D]', word)
                     if suffix:
@@ -4114,7 +4114,7 @@ class CatalogBuilder:
         Return:
         (str): char if A-z, else SYMBOLS
         '''
-        if not re.search('[a-zA-Z]', ascii_text(char)):
+        if not re.search(r'[a-zA-Z]', ascii_text(char)):
             return self.SYMBOLS
         else:
             return char
@@ -87,7 +87,7 @@ class NumberToText:  # {{{
             self.log('numberTranslate(): %s' % self.number)

         # Special case ordinals
-        if re.search('[st|nd|rd|th]',self.number):
+        if re.search(r'[st|nd|rd|th]',self.number):
             self.number = self.number.replace(',', '')
             ordinal_suffix = re.search(r'[\D]', self.number)
             ordinal_number = re.sub(r'\D','',self.number.replace(',', ''))
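One observation while reading this hunk (about the pre-existing pattern, not the commit): [st|nd|rd|th] is a character class, so it matches any single one of the characters s, t, n, d, r, h or |, not the two-letter ordinal suffixes; RUF039 only adds the raw prefix and deliberately preserves such behavior.

    import re

    assert re.search(r'[st|nd|rd|th]', '2nd')   # matches the single char 'n'
    assert re.search(r'[st|nd|rd|th]', 'x|y')   # '|' is in the class too
    assert re.search(r'(st|nd|rd|th)', '2nd')   # the alternation spelling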
@@ -134,7 +134,7 @@ class NumberToText:  # {{{
                 self.log('Hyphenated: %s' % self.number)
             self.number_as_float = self.number.split('-')[0]
             strings = self.number.split('-')
-            if re.search('[0-9]+', strings[0]):
+            if re.search(r'[0-9]+', strings[0]):
                 left = NumberToText(strings[0]).text
                 right = strings[1]
             else:
@@ -143,7 +143,7 @@ class NumberToText:  # {{{
             self.text = f'{left}-{right}'

         # Test for only commas and numbers
-        elif ',' in self.number and not re.search('[^0-9,]',self.number):
+        elif ',' in self.number and not re.search(r'[^0-9,]',self.number):
             if self.verbose:
                 self.log('Comma(s): %s' % self.number)
             self.number_as_float = self.number.replace(',', '')
@@ -1504,11 +1504,11 @@ def text_to_tokens(text):
     text = match.group(1)
     OR = True
 tokens = []
-quot = re.search('"(.*?)"', text)
+quot = re.search(r'"(.*?)"', text)
 while quot:
     tokens.append(quot.group(1))
     text = text.replace('"'+quot.group(1)+'"', '')
-    quot = re.search('"(.*?)"', text)
+    quot = re.search(r'"(.*?)"', text)
 tokens += text.split(' ')
 ans = []
 for i in tokens:
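
The loop above pulls double-quoted phrases out as single tokens before splitting the remainder on spaces. A standalone sketch of the same idea (the real function also handles OR groups and prefixed tokens):

import re

def quoted_then_words(text):
    # Collect "quoted phrases" first, removing each from the text,
    # then split whatever remains on single spaces.
    tokens = []
    quot = re.search(r'"(.*?)"', text)
    while quot:
        tokens.append(quot.group(1))
        text = text.replace('"' + quot.group(1) + '"', '')
        quot = re.search(r'"(.*?)"', text)
    return tokens + text.split(' ')

print(quoted_then_words('foo "bar baz" qux'))
# -> ['bar baz', 'foo', '', 'qux']  (the double space left behind yields an empty token)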
|
@@ -2556,7 +2556,7 @@ class BibTeX:
 self.invalid_cit = re.compile('[ "@\',\\#}{~%&$^]')
 self.upper = re.compile('[' +
                         string.ascii_uppercase + ']')
-self.escape = re.compile('[#&%_]')
+self.escape = re.compile(r'[#&%_]')
 
 def ValidateCitationKey(self, text):
     '''
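
Note that invalid_cit on the first context line keeps its plain string: it contains backslash escapes such as \\#, and the auto-fix (presumably by design) skips literals whose value, or whose spelling, would have to change under a raw prefix. The distinction, in a snippet:

# '\\#' is a two-character string: backslash + '#'. Prefixing r'' naively
# would change its value, so such literals are left alone by the auto-fix.
assert '\\#' == '\\' + '#'
assert r'\\#' == '\\' + '\\' + '#'   # raw keeps BOTH backslashes: 3 chars
assert '\\#' == r'\#'                # the equivalent raw spelling is r'\#'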
|
@@ -59,7 +59,7 @@ def get_opts_from_parser(parser, prefix):
 
 
 def send(ans):
-    pat = re.compile('([^0-9a-zA-Z_./-])')
+    pat = re.compile(r'([^0-9a-zA-Z_./-])')
     for x in sorted(set(ans)):
         x = pat.sub(lambda m : '\\'+m.group(1), x)
         if x.endswith('\\ '):
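
This hunk appears to be completion-related: the pattern captures any character outside [0-9a-zA-Z_./-], and the sub below it prefixes a backslash, which is ordinary shell escaping. A rough sketch with a made-up filename:

import re

pat = re.compile(r'([^0-9a-zA-Z_./-])')
# Backslash-escape every character that is not shell-safe.
print(pat.sub(lambda m: '\\' + m.group(1), 'My Book (draft).epub'))
# -> My\ Book\ \(draft\).epub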
|
@@ -384,7 +384,7 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
 
 repl_func = partial(fd_repl_func, dt, 'ap' in format.lower())
 return re.sub(
-    '(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))',
+    r'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))',
     repl_func, format)
 
 # }}}
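
The alternation being converted here tokenizes a date-format string; each group corresponds to one token kind and the replacement callable renders it. A minimal sketch of the pattern-with-callable technique, with a toy replacement standing in for calibre's fd_repl_func:

import re
from datetime import datetime

FMT_TOKEN = r'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))'

def toy_repl(dt, match):
    # Render only a few token kinds, just to show the mechanism.
    token = match.group()
    if token in ('yyyy', 'yy'):
        return str(dt.year) if token == 'yyyy' else f'{dt.year % 100:02d}'
    if token in ('MM', 'M'):
        return f'{dt.month:02d}' if token == 'MM' else str(dt.month)
    if token in ('dd', 'd'):
        return f'{dt.day:02d}' if token == 'dd' else str(dt.day)
    return token  # leave everything else untouched

dt = datetime(2025, 8, 7)
print(re.sub(FMT_TOKEN, lambda m: toy_repl(dt, m), 'dd/MM/yyyy'))  # -> 07/08/2025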
@@ -460,7 +460,7 @@ def clean_date_for_sort(dt, fmt=None):
       'min':UNDEFINED_DATE.minute, 'sec':UNDEFINED_DATE.second}
 
 repl_func = partial(cd_repl_func, tt, dt)
-re.sub('(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt)
+re.sub(r'(s{1,2})|(m{1,2})|(h{1,2})|(d{1,4}|M{1,4}|(?:yyyy|yy))', repl_func, fmt)
 return dt.replace(year=tt['year'], month=tt['mon'], day=tt['day'], hour=tt['hour'],
                   minute=tt['min'], second=tt['sec'], microsecond=0)
 # }}}
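
Worth noting: in this hunk the return value of re.sub is deliberately discarded; cd_repl_func is apparently used for its side effect of filling the tt dict as each format token is visited, and the dt.replace below rebuilds the datetime from it. A stand-in sketch of that side-effect pattern:

import re

seen = {}

def note_token(match):
    # Record which token kinds appear; the replacement text is irrelevant.
    seen[match.group()] = True
    return ''

re.sub(r'(d{1,4}|M{1,4}|(?:yyyy|yy))', note_token, 'dd MMM yyyy')
print(sorted(seen))  # -> ['MMM', 'dd', 'yyyy']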
|
@@ -90,7 +90,7 @@ def get_system_locale():
 
 def sanitize_lang(lang):
     if lang:
-        match = re.match('[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
+        match = re.match(r'[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
         if match:
             lang = match.group()
             if lang == 'zh':
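
The locale pattern accepts a two- or three-letter language code with an optional _COUNTRY part; since re.match anchors at the start of the string, trailing junk such as an encoding suffix is simply trimmed by match.group(). For instance:

import re

for lang in ('en_GB', 'pt_BR.UTF-8', 'deu', 'EN'):
    match = re.match(r'[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
    print(lang, '->', match.group() if match else None)
# en_GB -> en_GB
# pt_BR.UTF-8 -> pt_BR
# deu -> deu
# EN -> None   (uppercase language codes are rejected)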
|
@@ -195,7 +195,7 @@ class Parser:
 def tokenize(self, expr):
     # convert docstrings to base64 to avoid all processing. Change the docstring
     # indicator to something unique with no characters special to the parser.
-    expr = re.sub('(""")(..*?)(""")',
+    expr = re.sub(r'(""")(..*?)(""")',
                   lambda mo: self.docstring_sep + as_hex_unicode(mo.group(2)) + self.docstring_sep,
                   expr, flags=re.DOTALL)
 
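
This change touches the step that protects docstrings from the rest of the template parser by encoding everything between triple quotes (the comment says base64, though the helper name suggests hex); re.DOTALL lets the lazy . cross newlines so multi-line docstrings are caught too. A rough standalone sketch of the encode step, with stand-ins for the calibre internals as_hex_unicode and docstring_sep:

import re

DOCSTRING_SEP = '\x01'  # stand-in for the parser's unique separator

def hide_docstrings(expr):
    # Encode the docstring body as hex so none of its characters can be
    # mistaken for template-language syntax by later processing.
    return re.sub(r'(""")(..*?)(""")',
                  lambda mo: DOCSTRING_SEP + mo.group(2).encode().hex() + DOCSTRING_SEP,
                  expr, flags=re.DOTALL)

print(hide_docstrings('def x: """doc with {special} chars""" rest'))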
|
@@ -1730,7 +1730,7 @@ class BasicNewsRecipe(Recipe):
 
 def error_in_article_download(self, request, traceback):
     self.jobs_done += 1
-    if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None:
+    if traceback and re.search(r'^AbortArticle:', traceback, flags=re.M) is not None:
         self.log.warn('Aborted download of article:', request.article.title,
                       'from', request.article.url)
         self.report_progress(float(self.jobs_done)/len(self.jobs),
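
The ^AbortArticle: test scans a multi-line traceback string; with re.M the ^ anchor matches at the start of every line, not just the start of the whole string. A quick check of the difference:

import re

traceback = 'Traceback (most recent call last):\nAbortArticle: paywalled'
print(re.search(r'^AbortArticle:', traceback) is not None)              # False
print(re.search(r'^AbortArticle:', traceback, flags=re.M) is not None)  # True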
|
@@ -59,7 +59,7 @@ def styleFromList(styleName, specArray, spacing, showAllLevels):
 numbered = False
 displayLevels = 0
 listStyle = ListStyle(name=styleName)
-numFormatPattern = re.compile('([1IiAa])')
+numFormatPattern = re.compile(r'([1IiAa])')
 cssLengthPattern = re.compile('([^a-z]+)\\s*([a-z]+)?')
 m = cssLengthPattern.search(spacing)
 if (m is not None):
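
As in the BibTeX hunk earlier, cssLengthPattern keeps its doubled backslash and is skipped by the auto-fix. The skip is safe, just less readable, since the escaped spelling already denotes the same pattern text as the raw one:

# Both spellings hand the identical pattern text to the regex engine.
assert '([^a-z]+)\\s*([a-z]+)?' == r'([^a-z]+)\s*([a-z]+)?'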