mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
always use raw-string for regex (extra-edit)
This commit is contained in:
parent
3720de10d2
commit
07500e1ea3
@ -57,8 +57,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup('http://www.birminghammail.co.uk')
|
soup = self.index_to_soup('http://www.birminghammail.co.uk')
|
||||||
cov = soup.find(attrs={'src': re.compile(
|
cov = soup.find(attrs={'src': re.compile(r'http://images.icnetwork.co.uk/upl/birm')})
|
||||||
r'http://images.icnetwork.co.uk/upl/birm')})
|
|
||||||
cov = str(cov)
|
cov = str(cov)
|
||||||
cov2 = re.findall(
|
cov2 = re.findall(
|
||||||
r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
|
r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
|
||||||
|
@ -44,7 +44,7 @@ class StandardMediaKeRecipe(BasicNewsRecipe):
|
|||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
import re
|
import re
|
||||||
p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php')
|
p = re.compile(r'http://www.standardmedia.co.ke/.*InsidePage.php')
|
||||||
return p.sub('http://www.standardmedia.co.ke/print.php', url)
|
return p.sub(r'http://www.standardmedia.co.ke/print.php', url)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
return self.adeify_images(soup)
|
return self.adeify_images(soup)
|
||||||
|
@ -199,12 +199,10 @@ class ZeitEPUBAbo(BasicNewsRecipe):
|
|||||||
# '.*E-Paper.*')) # used to be '.*Abo-Bereich.*'
|
# '.*E-Paper.*')) # used to be '.*Abo-Bereich.*'
|
||||||
# browser.follow_link(abolink)
|
# browser.follow_link(abolink)
|
||||||
# find page for latest issue
|
# find page for latest issue
|
||||||
latestlink = browser.find_link(text_regex=re.compile(
|
latestlink = browser.find_link(text_regex=re.compile(r'.*ZUR AKTUELLEN AUSGABE.*'))
|
||||||
r'.*ZUR AKTUELLEN AUSGABE.*'))
|
|
||||||
browser.follow_link(latestlink)
|
browser.follow_link(latestlink)
|
||||||
# now find the correct file, we will still use the ePub file
|
# now find the correct file, we will still use the ePub file
|
||||||
epublink = browser.find_link(text_regex=re.compile(
|
epublink = browser.find_link(text_regex=re.compile(r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
|
||||||
r'.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
|
|
||||||
response = browser.follow_link(epublink)
|
response = browser.follow_link(epublink)
|
||||||
self.report_progress(1, _('next step'))
|
self.report_progress(1, _('next step'))
|
||||||
|
|
||||||
@ -265,12 +263,10 @@ class ZeitEPUBAbo(BasicNewsRecipe):
|
|||||||
# '.*Abo-Bereich.*'))
|
# '.*Abo-Bereich.*'))
|
||||||
# browser.follow_link(abolink)
|
# browser.follow_link(abolink)
|
||||||
# find page for latest issue
|
# find page for latest issue
|
||||||
latestlink = browser.find_link(text_regex=re.compile(
|
latestlink = browser.find_link(text_regex=re.compile(r'.*ZUR AKTUELLEN AUSGABE.*'))
|
||||||
r'.*ZUR AKTUELLEN AUSGABE.*'))
|
|
||||||
browser.follow_link(latestlink)
|
browser.follow_link(latestlink)
|
||||||
# actual cover search
|
# actual cover search
|
||||||
pdflink = browser.find_link(text_regex=re.compile(
|
pdflink = browser.find_link(text_regex=re.compile(r'.*GESAMT-PDF LADEN.*'))
|
||||||
r'.*GESAMT-PDF LADEN.*'))
|
|
||||||
cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + (
|
cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + (
|
||||||
urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf')
|
urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf')
|
||||||
self.log.warning('PDF link found:')
|
self.log.warning('PDF link found:')
|
||||||
|
@ -228,10 +228,10 @@ per_language_title_sort_articles = {
|
|||||||
# Polish
|
# Polish
|
||||||
'pol': (),
|
'pol': (),
|
||||||
# Italian
|
# Italian
|
||||||
'ita': ('Lo\\s+', 'Il\\s+', "L'", 'L´', 'La\\s+', 'Gli\\s+',
|
'ita': (r'Lo\s+', r'Il\s+', r"L'", r'L´', r'La\s+', r'Gli\s+',
|
||||||
'I\\s+', 'Le\\s+', 'Uno\\s+', 'Un\\s+', 'Una\\s+', "Un'",
|
r'I\s+', r'Le\s+', r'Uno\s+', r'Un\s+', r'Una\s+', r"Un'",
|
||||||
'Un´', 'Dei\\s+', 'Degli\\s+', 'Delle\\s+', 'Del\\s+',
|
r'Un´', r'Dei\s+', r'Degli\s+', r'Delle\s+', r'Del\s+',
|
||||||
'Della\\s+', 'Dello\\s+', "Dell'", 'Dell´'),
|
r'Della\s+', r'Dello\s+', r"Dell'", r'Dell´'),
|
||||||
# Portuguese
|
# Portuguese
|
||||||
'por': (r'A\s+', r'O\s+', r'Os\s+', r'As\s+', r'Um\s+', r'Uns\s+',
|
'por': (r'A\s+', r'O\s+', r'Os\s+', r'As\s+', r'Um\s+', r'Uns\s+',
|
||||||
r'Uma\s+', r'Umas\s+'),
|
r'Uma\s+', r'Umas\s+'),
|
||||||
|
@ -256,10 +256,10 @@ def get_parsed_proxy(typ='http', debug=True):
|
|||||||
proxy = proxies.get(typ, None)
|
proxy = proxies.get(typ, None)
|
||||||
if proxy:
|
if proxy:
|
||||||
pattern = re.compile((
|
pattern = re.compile((
|
||||||
'(?:ptype://)?'
|
r'(?:ptype://)?'
|
||||||
'(?:(?P<user>\\w+):(?P<pass>.*)@)?'
|
r'(?:(?P<user>\w+):(?P<pass>.*)@)?'
|
||||||
'(?P<host>[\\w\\-\\.]+)'
|
r'(?P<host>[\w\-\.]+)'
|
||||||
'(?::(?P<port>\\d+))?').replace('ptype', typ)
|
r'(?::(?P<port>\d+))?').replace('ptype', typ)
|
||||||
)
|
)
|
||||||
|
|
||||||
match = pattern.match(proxies[typ])
|
match = pattern.match(proxies[typ])
|
||||||
|
@ -43,7 +43,7 @@ def html_to_text(root):
|
|||||||
pat = re.compile(r'\n{3,}')
|
pat = re.compile(r'\n{3,}')
|
||||||
for body in root.xpath('h:body', namespaces=XPNSMAP):
|
for body in root.xpath('h:body', namespaces=XPNSMAP):
|
||||||
body.tail = ''
|
body.tail = ''
|
||||||
yield pat.sub('\n\n', ''.join(tag_to_text(body)).strip())
|
yield pat.sub(r'\n\n', ''.join(tag_to_text(body)).strip())
|
||||||
|
|
||||||
|
|
||||||
def to_text(container, name):
|
def to_text(container, name):
|
||||||
|
@ -298,8 +298,8 @@ class RTFInput(InputFormatPlugin):
|
|||||||
# clean multiple \n
|
# clean multiple \n
|
||||||
res = re.sub(br'\n+', b'\n', res)
|
res = re.sub(br'\n+', b'\n', res)
|
||||||
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
||||||
# res = re.sub('\s*<body>', '<body>', res)
|
# res = re.sub(br'\s*<body>', '<body>', res)
|
||||||
# res = re.sub('(?<=\n)\n{2}',
|
# res = re.sub(br'(?<=\n)\n{2}',
|
||||||
# '<p>\u00a0</p>\n'.encode('utf-8'), res)
|
# '<p>\u00a0</p>\n'.encode('utf-8'), res)
|
||||||
f.write(res)
|
f.write(res)
|
||||||
self.write_inline_css(inline_class, border_styles)
|
self.write_inline_css(inline_class, border_styles)
|
||||||
|
@ -274,7 +274,7 @@ class Dehyphenator:
|
|||||||
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
|
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
|
||||||
elif format == 'txt':
|
elif format == 'txt':
|
||||||
intextmatch = re.compile(
|
intextmatch = re.compile(
|
||||||
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)( |\t)*(?P<wraptags>(\n( |\t)*)+)(?P<secondpart>[\\w\\d]+)'% length)
|
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)( |\t)*(?P<wraptags>(\n( |\t)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'individual_words':
|
elif format == 'individual_words':
|
||||||
intextmatch = re.compile(
|
intextmatch = re.compile(
|
||||||
r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
|
r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
|
||||||
|
@ -35,11 +35,11 @@ class HeuristicProcessor:
|
|||||||
self.line_open = (
|
self.line_open = (
|
||||||
r'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*'
|
r'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*'
|
||||||
r'(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*')
|
r'(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*')
|
||||||
self.line_close = '(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>'
|
self.line_close = r'(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>'
|
||||||
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
|
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
|
||||||
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||||
self.common_in_text_endings = '["\'—’”,\\.!\\?\\…\\)„\\w]'
|
self.common_in_text_endings = r'["\'—’”,\.!\?\…\)„\w]'
|
||||||
self.common_in_text_beginnings = '[\\w\'"“‘‛]'
|
self.common_in_text_beginnings = r'[\w\'"“‘‛]'
|
||||||
|
|
||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return "<!-- created by calibre's pdftohtml -->" in src[:1000]
|
return "<!-- created by calibre's pdftohtml -->" in src[:1000]
|
||||||
@ -59,8 +59,8 @@ class HeuristicProcessor:
|
|||||||
else:
|
else:
|
||||||
delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
|
delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
|
||||||
delete_quotes = re.compile(r'\'"')
|
delete_quotes = re.compile(r'\'"')
|
||||||
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
txt_chap = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(chap)))
|
||||||
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
txt_title = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(title)))
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug('marked ' + str(self.html_preprocess_sections) +
|
self.log.debug('marked ' + str(self.html_preprocess_sections) +
|
||||||
' chapters & titles. - ' + str(chap) + ', ' + str(title))
|
' chapters & titles. - ' + str(chap) + ', ' + str(title))
|
||||||
@ -214,30 +214,30 @@ class HeuristicProcessor:
|
|||||||
self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings')
|
self.log.debug('found ' + str(self.html_preprocess_sections) + ' pre-existing headings')
|
||||||
|
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = '(?=<(p|div))'
|
init_lookahead = r'(?=<(p|div))'
|
||||||
chapter_line_open = self.line_open
|
chapter_line_open = self.line_open
|
||||||
title_line_open = (r'<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?'
|
title_line_open = (r'<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?'
|
||||||
r'\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*')
|
r'\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*')
|
||||||
chapter_header_open = r'(?P<chap>'
|
chapter_header_open = r'(?P<chap>'
|
||||||
title_header_open = r'(?P<title>'
|
title_header_open = r'(?P<title>'
|
||||||
chapter_header_close = ')\\s*'
|
chapter_header_close = r')\s*'
|
||||||
title_header_close = ')'
|
title_header_close = r')'
|
||||||
chapter_line_close = self.line_close
|
chapter_line_close = self.line_close
|
||||||
title_line_close = '(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>'
|
title_line_close = r'(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>'
|
||||||
|
|
||||||
is_pdftohtml = self.is_pdftohtml(html)
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
title_line_open = '<(?P<outer2>p)[^>]*>\\s*'
|
title_line_open = r'<(?P<outer2>p)[^>]*>\s*'
|
||||||
title_line_close = '\\s*</(?P=outer2)>'
|
title_line_close = r'\s*</(?P=outer2)>'
|
||||||
|
|
||||||
if blanks_between_paragraphs:
|
if blanks_between_paragraphs:
|
||||||
blank_lines = '(\\s*<p[^>]*>\\s*</p>){0,2}\\s*'
|
blank_lines = r'(\s*<p[^>]*>\s*</p>){0,2}\s*'
|
||||||
else:
|
else:
|
||||||
blank_lines = ''
|
blank_lines = ''
|
||||||
opt_title_open = '('
|
opt_title_open = r'('
|
||||||
opt_title_close = ')?'
|
opt_title_close = r')?'
|
||||||
n_lookahead_open = '(?!\\s*'
|
n_lookahead_open = r'(?!\s*'
|
||||||
n_lookahead_close = ')\\s*'
|
n_lookahead_close = r')\s*'
|
||||||
|
|
||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
simple_title = r'(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)'
|
simple_title = r'(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)'
|
||||||
@ -369,10 +369,10 @@ class HeuristicProcessor:
|
|||||||
lookahead = '(?<=.{'+str(length)+r'}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýźâêîôûçąężłıãõñæøþðßěľščťžňďřůёђєіїјљњћўџѣа-я,:)\\IAß]|(?<!\&\w{4});))'
|
lookahead = '(?<=.{'+str(length)+r'}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýźâêîôûçąężłıãõñæøþðßěľščťžňďřůёђєіїјљњћўџѣа-я,:)\\IAß]|(?<!\&\w{4});))'
|
||||||
em_en_lookahead = '(?<=.{'+str(length)+'}[\u2013\u2014])'
|
em_en_lookahead = '(?<=.{'+str(length)+'}[\u2013\u2014])'
|
||||||
soft_hyphen = '\xad'
|
soft_hyphen = '\xad'
|
||||||
line_ending = '\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?'
|
line_ending = r'\s*(?P<style_close></(span|[iub])>)?\s*(</(p|div)>)?'
|
||||||
blanklines = '\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*'
|
blanklines = r'\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*'
|
||||||
line_opening = '<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*'
|
line_opening = r'<(p|div)[^>]*>\s*(?P<style_open><(span|[iub])[^>]*>)?\s*'
|
||||||
txt_line_wrap = '((\u0020|\u0009)*\n){1,4}'
|
txt_line_wrap = r'((\u0020|\u0009)*\n){1,4}'
|
||||||
|
|
||||||
if format == 'txt':
|
if format == 'txt':
|
||||||
unwrap_regex = lookahead+txt_line_wrap
|
unwrap_regex = lookahead+txt_line_wrap
|
||||||
@ -418,12 +418,12 @@ class HeuristicProcessor:
|
|||||||
# TODO - find out if there are cases where there is more than one <pre> tag or
|
# TODO - find out if there are cases where there is more than one <pre> tag or
|
||||||
# other types of unmarked html and handle them in some better fashion
|
# other types of unmarked html and handle them in some better fashion
|
||||||
add_markup = re.compile(r'(?<!>)(\n)')
|
add_markup = re.compile(r'(?<!>)(\n)')
|
||||||
html = add_markup.sub('</p>\n<p>', html)
|
html = add_markup.sub(r'</p>\n<p>', html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def arrange_htm_line_endings(self, html):
|
def arrange_htm_line_endings(self, html):
|
||||||
html = re.sub(r'\s*</(?P<tag>p|div)>', '</'+'\\g<tag>'+'>\n', html)
|
html = re.sub(r'\s*</(?P<tag>p|div)>', r'</\g<tag>'+'>\n', html)
|
||||||
html = re.sub(r'\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*', '\n<'+'\\g<tag>'+'\\g<style>'+'>', html)
|
html = re.sub(r'\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*', r'\n<\g<tag>\g<style>'+'>', html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def fix_nbsp_indents(self, html):
|
def fix_nbsp_indents(self, html):
|
||||||
@ -546,7 +546,7 @@ class HeuristicProcessor:
|
|||||||
|
|
||||||
def markup_whitespaces(match):
|
def markup_whitespaces(match):
|
||||||
blanks = match.group(0)
|
blanks = match.group(0)
|
||||||
blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
|
blanks = self.blankreg.sub(r'\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
|
||||||
return blanks
|
return blanks
|
||||||
|
|
||||||
html = blanks_n_nopunct.sub(markup_whitespaces, html)
|
html = blanks_n_nopunct.sub(markup_whitespaces, html)
|
||||||
@ -556,10 +556,10 @@ class HeuristicProcessor:
|
|||||||
return html
|
return html
|
||||||
|
|
||||||
def detect_soft_breaks(self, html):
|
def detect_soft_breaks(self, html):
|
||||||
line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
|
line = '(?P<initline>'+self.line_open+r'\s*(?P<init_content>.*?)'+self.line_close+')'
|
||||||
line_two = '(?P<line_two>'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \
|
line_two = '(?P<line_two>'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_open)+ \
|
||||||
'\\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')'
|
r'\s*(?P<line_two_content>.*?)'+re.sub(r'(ou|in|cha)', 'linetwo_', self.line_close)+')'
|
||||||
div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
|
div_break_candidate_pattern = line+r'\s*<div[^>]*>\s*</div>\s*'+line_two
|
||||||
div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
|
div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
|
||||||
|
|
||||||
def convert_div_softbreaks(match):
|
def convert_div_softbreaks(match):
|
||||||
@ -575,16 +575,16 @@ class HeuristicProcessor:
|
|||||||
html = div_break_candidate.sub(convert_div_softbreaks, html)
|
html = div_break_candidate.sub(convert_div_softbreaks, html)
|
||||||
|
|
||||||
if not self.blanks_deleted and self.blanks_between_paragraphs:
|
if not self.blanks_deleted and self.blanks_between_paragraphs:
|
||||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
|
html = self.multi_blank.sub(r'\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||||
else:
|
else:
|
||||||
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
html = self.blankreg.sub(r'\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def detect_scene_breaks(self, html):
|
def detect_scene_breaks(self, html):
|
||||||
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
|
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
|
||||||
'<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?){1,10})\\s*'+self.line_close
|
r'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?){1,10})\s*'+self.line_close
|
||||||
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||||
html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
|
html = scene_breaks.sub(self.scene_break_open+r'\g<break></p>', html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def markup_user_break(self, replacement_break):
|
def markup_user_break(self, replacement_break):
|
||||||
@ -768,11 +768,11 @@ class HeuristicProcessor:
|
|||||||
|
|
||||||
is_pdftohtml = self.is_pdftohtml(html)
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
self.line_open = '<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*'
|
self.line_open = r'<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*'
|
||||||
self.line_close = '\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>'
|
self.line_close = r'\s*(</[ibu][^>]*>\s*)?</(?P=outer)>'
|
||||||
|
|
||||||
# ADE doesn't render <br />, change to empty paragraphs
|
# ADE doesn't render <br />, change to empty paragraphs
|
||||||
# html = re.sub('<br[^>]*>', '<p>\u00a0</p>', html)
|
# html = re.sub(r'<br[^>]*>', '<p>\u00a0</p>', html)
|
||||||
|
|
||||||
# Determine whether the document uses interleaved blank lines
|
# Determine whether the document uses interleaved blank lines
|
||||||
self.blanks_between_paragraphs = self.analyze_blanks(html)
|
self.blanks_between_paragraphs = self.analyze_blanks(html)
|
||||||
@ -791,7 +791,7 @@ class HeuristicProcessor:
|
|||||||
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
self.log.debug('deleting blank lines')
|
self.log.debug('deleting blank lines')
|
||||||
self.blanks_deleted = True
|
self.blanks_deleted = True
|
||||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
html = self.multi_blank.sub(r'\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
# Determine line ending type
|
# Determine line ending type
|
||||||
@ -845,7 +845,7 @@ class HeuristicProcessor:
|
|||||||
# headings and titles, images, etc
|
# headings and titles, images, etc
|
||||||
doubleheading = re.compile(
|
doubleheading = re.compile(
|
||||||
r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub(r'\g<firsthead>\n<h3\g<secondhead></h3>', html)
|
||||||
|
|
||||||
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
|
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
|
||||||
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
|
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
|
||||||
@ -876,5 +876,5 @@ class HeuristicProcessor:
|
|||||||
|
|
||||||
if self.deleted_nbsps:
|
if self.deleted_nbsps:
|
||||||
# put back non-breaking spaces in empty paragraphs so they render correctly
|
# put back non-breaking spaces in empty paragraphs so they render correctly
|
||||||
html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
|
html = self.anyblank.sub(r'\n\g<openline>' + '\u00a0' + r'\g<closeline>', html)
|
||||||
return html
|
return html
|
||||||
|
@ -108,7 +108,7 @@ class HTMLConverter:
|
|||||||
lambda match: '<a'+match.group(1)+'></a>'),
|
lambda match: '<a'+match.group(1)+'></a>'),
|
||||||
# Strip comments from <style> tags. This is needed as
|
# Strip comments from <style> tags. This is needed as
|
||||||
# sometimes there are unterminated comments
|
# sometimes there are unterminated comments
|
||||||
(re.compile(r'<\s*style.*?>(.*?)<\/\s*style\s*>', re.DOTALL|re.IGNORECASE),
|
(re.compile(r'<\s*style.*?>(.*?)</\s*style\s*>', re.DOTALL|re.IGNORECASE),
|
||||||
lambda match: match.group().replace('<!--', '').replace('-->', '')),
|
lambda match: match.group().replace('<!--', '').replace('-->', '')),
|
||||||
# remove <p> tags from within <a href> tags
|
# remove <p> tags from within <a href> tags
|
||||||
(re.compile(r'<\s*a\s+[^<>]*href\s*=[^<>]*>(.*?)<\s*/\s*a\s*>', re.DOTALL|re.IGNORECASE),
|
(re.compile(r'<\s*a\s+[^<>]*href\s*=[^<>]*>(.*?)<\s*/\s*a\s*>', re.DOTALL|re.IGNORECASE),
|
||||||
|
@ -724,8 +724,8 @@ class Worker(Thread): # Get details {{{
|
|||||||
# remove all attributes from tags
|
# remove all attributes from tags
|
||||||
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
||||||
# Collapse whitespace
|
# Collapse whitespace
|
||||||
# desc = re.sub('\n+', '\n', desc)
|
# desc = re.sub(r'\n+', '\n', desc)
|
||||||
# desc = re.sub(' +', ' ', desc)
|
# desc = re.sub(r' +', ' ', desc)
|
||||||
# Remove the notice about text referring to out of print editions
|
# Remove the notice about text referring to out of print editions
|
||||||
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
|
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
|
||||||
# Remove comments
|
# Remove comments
|
||||||
|
@ -120,8 +120,8 @@ class Worker(Thread): # {{{
|
|||||||
# remove all attributes from tags
|
# remove all attributes from tags
|
||||||
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
||||||
# Collapse whitespace
|
# Collapse whitespace
|
||||||
# desc = re.sub('\n+', '\n', desc)
|
# desc = re.sub(r'\n+', '\n', desc)
|
||||||
# desc = re.sub(' +', ' ', desc)
|
# desc = re.sub(r' +', ' ', desc)
|
||||||
# Remove comments
|
# Remove comments
|
||||||
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
||||||
return sanitize_comments_html(desc)
|
return sanitize_comments_html(desc)
|
||||||
|
@ -368,17 +368,17 @@ class MobiReader:
|
|||||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||||
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
||||||
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
||||||
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
|
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', '', self.processed_html)
|
||||||
# Swap inline and block level elements, and order block level elements according to priority
|
# Swap inline and block level elements, and order block level elements according to priority
|
||||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||||
self.processed_html = re.sub(
|
self.processed_html = re.sub(
|
||||||
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
|
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>\g<styletags>', self.processed_html)
|
||||||
self.processed_html = re.sub(
|
self.processed_html = re.sub(
|
||||||
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
|
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>\g<para>', self.processed_html)
|
||||||
self.processed_html = re.sub(
|
self.processed_html = re.sub(
|
||||||
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
|
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>\g<blockquote>', self.processed_html)
|
||||||
self.processed_html = re.sub(
|
self.processed_html = re.sub(
|
||||||
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
|
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>\g<para>', self.processed_html)
|
||||||
bods = htmls = 0
|
bods = htmls = 0
|
||||||
for x in re.finditer(r'</body>|</html>', self.processed_html):
|
for x in re.finditer(r'</body>|</html>', self.processed_html):
|
||||||
if x == '</body>':
|
if x == '</body>':
|
||||||
|
@ -24,12 +24,12 @@ REGEXES = {
|
|||||||
'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
|
'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
|
||||||
'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501
|
'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I), # noqa: E501
|
||||||
'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
|
'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
|
||||||
# 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
|
# 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
|
||||||
# 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
|
# 'replaceFontsRe': re.compile(r'<(/?)font[^>]*>',re.I),
|
||||||
# 'trimRe': re.compile('^\s+|\s+$/'),
|
# 'trimRe': re.compile(r'^\s+|\s+$/'),
|
||||||
# 'normalizeRe': re.compile('\s{2,}/'),
|
# 'normalizeRe': re.compile(r'\s{2,}/'),
|
||||||
# 'killBreaksRe': re.compile('(<br\s*\/?>(\s| ?)*){1,}/'),
|
# 'killBreaksRe': re.compile(r'(<br\s*/?>(\s| ?)*){1,}/'),
|
||||||
# 'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
|
# 'videoRe': re.compile(r'http://(www\.)?(youtube|vimeo)\.com', re.I),
|
||||||
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
|
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -195,7 +195,7 @@ class RTFMLizer:
|
|||||||
|
|
||||||
def clean_text(self, text):
|
def clean_text(self, text):
|
||||||
# Remove excessive newlines
|
# Remove excessive newlines
|
||||||
text = re.sub('%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text)
|
text = re.sub(r'%s{3,}' % os.linesep, f'{os.linesep}{os.linesep}', text)
|
||||||
|
|
||||||
# Remove excessive spaces
|
# Remove excessive spaces
|
||||||
text = re.sub(r'[ ]{2,}', ' ', text)
|
text = re.sub(r'[ ]{2,}', ' ', text)
|
||||||
|
@ -156,10 +156,10 @@ class SNBMLizer:
|
|||||||
text = text.replace('\f+', ' ')
|
text = text.replace('\f+', ' ')
|
||||||
|
|
||||||
# Single line paragraph.
|
# Single line paragraph.
|
||||||
text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
|
text = re.sub(r'(?<=.)%s(?=.)' % os.linesep, ' ', text)
|
||||||
|
|
||||||
# Remove multiple spaces.
|
# Remove multiple spaces.
|
||||||
# text = re.sub('[ ]{2,}', ' ', text)
|
# text = re.sub(r'[ ]{2,}', ' ', text)
|
||||||
|
|
||||||
# Remove excessive newlines.
|
# Remove excessive newlines.
|
||||||
text = re.sub(r'\n[ ]+\n', '\n\n', text)
|
text = re.sub(r'\n[ ]+\n', '\n\n', text)
|
||||||
|
@ -760,7 +760,7 @@ class Textile:
|
|||||||
'''
|
'''
|
||||||
what is this for?
|
what is this for?
|
||||||
'''
|
'''
|
||||||
pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U)
|
pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?)://|/)\S+)(?=\s|$)', re.U)
|
||||||
text = pattern.sub(self.refs, text)
|
text = pattern.sub(self.refs, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@ -119,8 +119,8 @@ def unsmarten(txt):
|
|||||||
txt = re.sub(r'♦|♦|♦', r'{diamond}', txt) # diamond
|
txt = re.sub(r'♦|♦|♦', r'{diamond}', txt) # diamond
|
||||||
|
|
||||||
# Move into main code?
|
# Move into main code?
|
||||||
# txt = re.sub('\xa0', r'p. ', txt) # blank paragraph
|
# txt = re.sub(r'\xa0', r'p. ', txt) # blank paragraph
|
||||||
# txt = re.sub('\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
|
# txt = re.sub(r'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
|
||||||
# txt = re.sub('\n \n', r'\n<br />\n', txt) # blank paragraph - br tag
|
# txt = re.sub(r'\n \n', r'\n<br />\n', txt) # blank paragraph - br tag
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
@ -222,7 +222,7 @@ def remove_indents(txt):
|
|||||||
'''
|
'''
|
||||||
Remove whitespace at the beginning of each line.
|
Remove whitespace at the beginning of each line.
|
||||||
'''
|
'''
|
||||||
return re.sub(r'^[\r\t\f\v ]+', r'', txt, flags=re.MULTILINE)
|
return re.sub(r'^[\r\t\f\v ]+', '', txt, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
|
||||||
def opf_writer(path, opf_name, manifest, spine, mi):
|
def opf_writer(path, opf_name, manifest, spine, mi):
|
||||||
|
@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
text = re.sub(r'"(.+)":'+i+r'(\s)', r'\1\2', text)
|
text = re.sub(r'"(.+)":'+i+r'(\s)', r'\1\2', text)
|
||||||
for i in self.our_ids:
|
for i in self.our_ids:
|
||||||
if i not in self.our_links:
|
if i not in self.our_links:
|
||||||
text = re.sub(r'%?\('+i+'\\)\xa0?%?', r'', text)
|
text = re.sub(r'%?\('+i+'\\)\xa0?%?', '', text)
|
||||||
|
|
||||||
# Remove obvious non-needed escaping, add sub/sup-script ones
|
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||||
text = check_escaping(text, [r'\*', '_', r'\*'])
|
text = check_escaping(text, [r'\*', '_', r'\*'])
|
||||||
@ -101,7 +101,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
# remove spaces before a newline
|
# remove spaces before a newline
|
||||||
text = re.sub(r' +\n', r'\n', text)
|
text = re.sub(r' +\n', r'\n', text)
|
||||||
# remove newlines at top of file
|
# remove newlines at top of file
|
||||||
text = re.sub(r'^\n+', r'', text)
|
text = re.sub(r'^\n+', '', text)
|
||||||
# correct blockcode paras
|
# correct blockcode paras
|
||||||
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
||||||
# correct blockquote paras
|
# correct blockquote paras
|
||||||
|
@ -31,7 +31,7 @@ def comparable_price(text):
|
|||||||
m = re.sub(r'[.,\' ]', '.', match.group())
|
m = re.sub(r'[.,\' ]', '.', match.group())
|
||||||
# remove all separators accept fraction,
|
# remove all separators accept fraction,
|
||||||
# leave only 2 digits in fraction
|
# leave only 2 digits in fraction
|
||||||
m = re.sub(r'\.(?!\d*$)', r'', m)
|
m = re.sub(r'\.(?!\d*$)', '', m)
|
||||||
text = f'{float(m) * 100.:0>8.0f}'
|
text = f'{float(m) * 100.:0>8.0f}'
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@ -248,7 +248,7 @@ class SearchDialog(QDialog, Ui_Dialog):
|
|||||||
query = re.sub(r'%s:"[^"]"' % loc, '', query)
|
query = re.sub(r'%s:"[^"]"' % loc, '', query)
|
||||||
query = re.sub(r'%s:[^\s]*' % loc, '', query)
|
query = re.sub(r'%s:[^\s]*' % loc, '', query)
|
||||||
# Remove logic.
|
# Remove logic.
|
||||||
query = re.sub(r'(^|\s|")(and|not|or|a|the|is|of)(\s|$|")', r' ', query)
|
query = re.sub(r'(^|\s|")(and|not|or|a|the|is|of)(\s|$|")', ' ', query)
|
||||||
# Remove "
|
# Remove "
|
||||||
query = query.replace('"', '')
|
query = query.replace('"', '')
|
||||||
# Remove excess whitespace.
|
# Remove excess whitespace.
|
||||||
|
@ -80,7 +80,7 @@ class EmpikStore(BasicStoreConfig, StorePlugin):
|
|||||||
# with closing(br.open('https://empik.com' + id.strip(), timeout=timeout/4)) as nf:
|
# with closing(br.open('https://empik.com' + id.strip(), timeout=timeout/4)) as nf:
|
||||||
# idata = html.fromstring(nf.read())
|
# idata = html.fromstring(nf.read())
|
||||||
# crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()')
|
# crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()')
|
||||||
# formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x])
|
# formats = ','.join([re.sub(r'ebook, ','', x.strip()) for x in crawled if 'ebook' in x])
|
||||||
|
|
||||||
counter -= 1
|
counter -= 1
|
||||||
|
|
||||||
|
@ -160,7 +160,7 @@ class CSV_XML(CatalogPlugin):
|
|||||||
if isinstance(item, str):
|
if isinstance(item, str):
|
||||||
opening_tag = re.search(r'<(\w+)( |>)', item)
|
opening_tag = re.search(r'<(\w+)( |>)', item)
|
||||||
if opening_tag:
|
if opening_tag:
|
||||||
closing_tag = re.search(r'<\/%s>$' % opening_tag.group(1), item)
|
closing_tag = re.search(r'</%s>$' % opening_tag.group(1), item)
|
||||||
if closing_tag:
|
if closing_tag:
|
||||||
item = html2text(item)
|
item = html2text(item)
|
||||||
|
|
||||||
|
@ -514,9 +514,9 @@ def set_note(ctx, rd, field, item_id, library_id):
|
|||||||
db_replacements[key] = f'{RESOURCE_URL_SCHEME}://{scheme}/{digest}'
|
db_replacements[key] = f'{RESOURCE_URL_SCHEME}://{scheme}/{digest}'
|
||||||
db_html = srv_html = html
|
db_html = srv_html = html
|
||||||
if db_replacements:
|
if db_replacements:
|
||||||
db_html = re.sub('|'.join(map(re.escape, db_replacements)), lambda m: db_replacements[m.group()], html)
|
db_html = re.sub(r'|'.join(map(re.escape, db_replacements)), lambda m: db_replacements[m.group()], html)
|
||||||
if srv_replacements:
|
if srv_replacements:
|
||||||
srv_html = re.sub('|'.join(map(re.escape, srv_replacements)), lambda m: srv_replacements[m.group()], html)
|
srv_html = re.sub(r'|'.join(map(re.escape, srv_replacements)), lambda m: srv_replacements[m.group()], html)
|
||||||
db.set_notes_for(field, item_id, db_html, searchable_text, resources)
|
db.set_notes_for(field, item_id, db_html, searchable_text, resources)
|
||||||
rd.outheaders['Content-Type'] = 'text/html; charset=UTF-8'
|
rd.outheaders['Content-Type'] = 'text/html; charset=UTF-8'
|
||||||
return srv_html
|
return srv_html
|
||||||
|
@ -124,7 +124,7 @@ def default_is_link_wanted(url, tag):
|
|||||||
|
|
||||||
class RecursiveFetcher:
|
class RecursiveFetcher:
|
||||||
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
|
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
|
||||||
('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$'))
|
(r'.exe\s*$', r'.mp3\s*$', r'.ogg\s*$', r'^\s*mailto:', r'^\s*$'))
|
||||||
# ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
|
# ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
|
||||||
# (
|
# (
|
||||||
#
|
#
|
||||||
|
Loading…
x
Reference in New Issue
Block a user