decode unicode-escape (extra-edit)

This commit is contained in:
un-pogaz 2025-01-24 11:14:16 +01:00
parent 12cb8b2e58
commit 534293eabc
17 changed files with 87 additions and 87 deletions

View File

@ -299,8 +299,8 @@ class iPadOutput(OutputProfile):
} }
] ]
ratings_char = '\u2605' # filled star ratings_char = '★' # filled star
empty_ratings_char = '\u2606' # hollow star empty_ratings_char = '☆' # hollow star
touchscreen = True touchscreen = True
# touchscreen_news_css {{{ # touchscreen_news_css {{{
@ -677,8 +677,8 @@ class KindleOutput(OutputProfile):
supports_mobi_indexing = True supports_mobi_indexing = True
periodical_date_in_title = False periodical_date_in_title = False
empty_ratings_char = '\u2606' empty_ratings_char = '☆'
ratings_char = '\u2605' ratings_char = '★'
mobi_ems_per_blockquote = 2.0 mobi_ems_per_blockquote = 2.0
@ -696,8 +696,8 @@ class KindleDXOutput(OutputProfile):
# comic_screen_size = (741, 1022) # comic_screen_size = (741, 1022)
supports_mobi_indexing = True supports_mobi_indexing = True
periodical_date_in_title = False periodical_date_in_title = False
empty_ratings_char = '\u2606' empty_ratings_char = '☆'
ratings_char = '\u2605' ratings_char = '★'
mobi_ems_per_blockquote = 2.0 mobi_ems_per_blockquote = 2.0

View File

@ -77,7 +77,7 @@ def _get_comments(soup):
pages = (_metadata_from_span(soup, 'pages') or _metadata_from_table(soup, 'pages')) pages = (_metadata_from_span(soup, 'pages') or _metadata_from_table(soup, 'pages'))
try: try:
# date span can have copyright symbols in it... # date span can have copyright symbols in it...
date = date.replace('\u00a9', '').strip() date = date.replace('©', '').strip()
# and pages often comes as '(\d+ pages)' # and pages often comes as '(\d+ pages)'
pages = re.search(r'\d+', pages).group(0) pages = re.search(r'\d+', pages).group(0)
return f'Published {date}, {pages} pages.' return f'Published {date}, {pages} pages.'

View File

@ -19,20 +19,20 @@ XLINK_NS = 'http://www.w3.org/1999/xlink'
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) _span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
LIGATURES = { LIGATURES = {
# '\u00c6': 'AE', # 'Æ': 'AE',
# '\u00e6': 'ae', # 'æ': 'ae',
# '\u0152': 'OE', # 'Œ': 'OE',
# '\u0153': 'oe', # 'œ': 'oe',
# '\u0132': 'IJ', # 'IJ': 'IJ',
# '\u0133': 'ij', # 'ij': 'ij',
# '\u1D6B': 'ue', # 'ᵫ': 'ue',
'\uFB00': 'ff', 'ﬀ': 'ff',
'\uFB01': 'fi', 'ﬁ': 'fi',
'\uFB02': 'fl', 'ﬂ': 'fl',
'\uFB03': 'ffi', 'ﬃ': 'ffi',
'\uFB04': 'ffl', 'ﬄ': 'ffl',
'\uFB05': 'ft', 'ﬅ': 'ft',
'\uFB06': 'st', 'ﬆ': 'st',
} }
_ligpat = re.compile('|'.join(LIGATURES)) _ligpat = re.compile('|'.join(LIGATURES))
@ -240,7 +240,7 @@ class Dehyphenator:
else: else:
if self.verbose > 2: if self.verbose > 2:
self.log(' Cleanup:returning original text '+firsthalf+' + linefeed '+secondhalf) self.log(' Cleanup:returning original text '+firsthalf+' + linefeed '+secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf return firsthalf+'—'+wraptags+secondhalf
else: else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6: if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
@ -274,7 +274,7 @@ class Dehyphenator:
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length) r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
elif format == 'txt': elif format == 'txt':
intextmatch = re.compile( intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length) '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)( |\t)*(?P<wraptags>(\n( |\t)*)+)(?P<secondpart>[\\w\\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile( intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE) r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)

View File

@ -2194,8 +2194,8 @@ def find_tests():
t('a&amp;b&lt;c', 'a&b<c') t('a&amp;b&lt;c', 'a&b<c')
t('a&acE;b', 'a∾̳b') t('a&acE;b', 'a∾̳b')
t('a&#1234;b', 'aӒb') t('a&#1234;b', 'aӒb')
t('a&#X1234;b', 'a\u1234b') t('a&#X1234;b', 'aሴb')
t('a&#x1034fA;b', 'a\U001034fAb') t('a&#x1034fA;b', 'a\U001034FAb')
t('a&#0;b&#x000;c', 'abc') t('a&#0;b&#x000;c', 'abc')
x('&amp;&lt;&gt;&apos;&quot;', '&amp;&lt;&gt;&apos;&quot;') x('&amp;&lt;&gt;&apos;&quot;', '&amp;&lt;&gt;&apos;&quot;')

View File

@ -803,7 +803,7 @@ class HTMLConverter:
src = src.lstrip() src = src.lstrip()
f = src[0] f = src[0]
next = 1 next = 1
if f in ("'", '"', '\u201c', '\u2018', '\u201d', '\u2019'): if f in ("'", '"', '“', '‘', '”', '’'):
if len(src) >= 2: if len(src) >= 2:
next = 2 next = 2
f = src[:2] f = src[:2]
@ -819,7 +819,7 @@ class HTMLConverter:
def append_text(src): def append_text(src):
fp, key, variant = self.font_properties(css) fp, key, variant = self.font_properties(css)
for x, y in [('\xad', ''), ('\xa0', ' '), ('\ufb00', 'ff'), ('\ufb01', 'fi'), ('\ufb02', 'fl'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl')]: for x, y in [('\xad', ''), ('\xa0', ' '), ('ﬀ', 'ff'), ('ﬁ', 'fi'), ('ﬂ', 'fl'), ('ﬃ', 'ffi'), ('ﬄ', 'ffl')]:
src = src.replace(x, y) src = src.replace(x, y)
def valigner(x): def valigner(x):
@ -1624,7 +1624,7 @@ class HTMLConverter:
in_ol = parent.name.lower() == 'ol' in_ol = parent.name.lower() == 'ol'
break break
parent = parent.parent parent = parent.parent
prepend = str(self.list_counter)+'. ' if in_ol else '\u2022' + ' ' prepend = str(self.list_counter)+'. ' if in_ol else '•' + ' '
self.current_para.append(Span(prepend)) self.current_para.append(Span(prepend))
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
if in_ol: if in_ol:

View File

@ -529,11 +529,11 @@ class MobiMLizer:
t = elem.text t = elem.text
if not t: if not t:
t = '' t = ''
elem.text = '\u201c' + t elem.text = '“' + t
t = elem.tail t = elem.tail
if not t: if not t:
t = '' t = ''
elem.tail = '\u201d' + t elem.tail = '”' + t
text = None text = None
if elem.text: if elem.text:
if istate.preserve or istate.pre_wrap: if istate.preserve or istate.pre_wrap:

View File

@ -362,7 +362,7 @@ class Serializer:
text = text.replace('&', '&amp;') text = text.replace('&', '&amp;')
text = text.replace('<', '&lt;') text = text.replace('<', '&lt;')
text = text.replace('>', '&gt;') text = text.replace('>', '&gt;')
text = text.replace('\u00AD', '') # Soft-hyphen text = text.replace('\u00ad', '') # Soft-hyphen
if quot: if quot:
text = text.replace('"', '&quot;') text = text.replace('"', '&quot;')
if isinstance(text, str): if isinstance(text, str):

View File

@ -100,9 +100,9 @@ def adjacent_quotes(first_string, second_string):
if fchar is not None: if fchar is not None:
first_char = fchar.group(1) # First non-space char first_char = fchar.group(1) # First non-space char
return bool((last_char == '\u0022' and first_char == '\u0022') \ return bool((last_char == '"' and first_char == '"') \
or (last_char == '\u2019' and first_char == '\u2018') \ or (last_char == '’' and first_char == '‘') \
or (last_char == '\u201d' and first_char == '\u201c')) or (last_char == '”' and first_char == '“'))
class Font: class Font:

View File

@ -24,9 +24,9 @@ def normalize_entities(cur_title):
'\u2013':'-', '\u2013':'-',
'&mdash;': '-', '&mdash;': '-',
'&ndash;': '-', '&ndash;': '-',
'\u00A0': ' ', '\u00a0': ' ',
'\u00AB': '"', '\u00ab': '"',
'\u00BB': '"', '\u00bb': '"',
'&quot;': '"', '&quot;': '"',
} }
for c, r in iteritems(entities): for c, r in iteritems(entities):

View File

@ -1693,7 +1693,7 @@ def elided_text(text, font=None, width=300, pos='middle'):
font = QApplication.instance().font() font = QApplication.instance().font()
fm = (font if isinstance(font, QFontMetrics) else QFontMetrics(font)) fm = (font if isinstance(font, QFontMetrics) else QFontMetrics(font))
delta = 4 delta = 4
ellipsis = '\u2026' ellipsis = '…'
def remove_middle(x): def remove_middle(x):
mid = len(x) // 2 mid = len(x) // 2

View File

@ -121,13 +121,13 @@ class PluginWidget(QWidget,Ui_Form):
'name':_('Read book'), 'name':_('Read book'),
'field':_('Tags'), 'field':_('Tags'),
'pattern':'+', 'pattern':'+',
'prefix':'\u2713'}, 'prefix':'✓'},
{'ordinal':1, {'ordinal':1,
'enabled':True, 'enabled':True,
'name':_('Wishlist item'), 'name':_('Wishlist item'),
'field':_('Tags'), 'field':_('Tags'),
'pattern':'Wishlist', 'pattern':'Wishlist',
'prefix':'\u00d7'},], 'prefix':'×'},],
['table_widget','table_widget'])) ['table_widget','table_widget']))
self.OPTION_FIELDS = option_fields self.OPTION_FIELDS = option_fields
@ -1351,52 +1351,52 @@ class PrefixRules(GenericRulesTable):
# Create a list of prefixes for user selection # Create a list of prefixes for user selection
raw_prefix_list = [ raw_prefix_list = [
('Ampersand', '&'), ('Ampersand', '&'),
('Angle left double', '\u00ab'), ('Angle left double', '«'),
('Angle left', '\u2039'), ('Angle left', '‹'),
('Angle right double', '\u00bb'), ('Angle right double', '»'),
('Angle right', '\u203a'), ('Angle right', '›'),
('Arrow carriage return', '\u21b5'), ('Arrow carriage return', '↵'),
('Arrow double', '\u2194'), ('Arrow double', '↔'),
('Arrow down', '\u2193'), ('Arrow down', '↓'),
('Arrow left', '\u2190'), ('Arrow left', '←'),
('Arrow right', '\u2192'), ('Arrow right', '→'),
('Arrow up', '\u2191'), ('Arrow up', '↑'),
('Asterisk', '*'), ('Asterisk', '*'),
('At sign', '@'), ('At sign', '@'),
('Bullet smallest', '\u22c5'), ('Bullet smallest', '⋅'),
('Bullet small', '\u00b7'), ('Bullet small', '·'),
('Bullet', '\u2022'), ('Bullet', '•'),
('Cards clubs', '\u2663'), ('Cards clubs', '♣'),
('Cards diamonds', '\u2666'), ('Cards diamonds', '♦'),
('Cards hearts', '\u2665'), ('Cards hearts', '♥'),
('Cards spades', '\u2660'), ('Cards spades', '♠'),
('Caret', '^'), ('Caret', '^'),
('Checkmark', '\u2713'), ('Checkmark', '✓'),
('Copyright circle c', '\u00a9'), ('Copyright circle c', '©'),
('Copyright circle r', '\u00ae'), ('Copyright circle r', '®'),
('Copyright trademark', '\u2122'), ('Copyright trademark', '™'),
('Currency cent', '\u00a2'), ('Currency cent', '¢'),
('Currency dollar', '$'), ('Currency dollar', '$'),
('Currency euro', '\u20ac'), ('Currency euro', '€'),
('Currency pound', '\u00a3'), ('Currency pound', '£'),
('Currency yen', '\u00a5'), ('Currency yen', '¥'),
('Dagger double', '\u2021'), ('Dagger double', '‡'),
('Dagger', '\u2020'), ('Dagger', '†'),
('Degree', '\u00b0'), ('Degree', '°'),
('Dots3', '\u2234'), ('Dots3', '∴'),
('Hash', '#'), ('Hash', '#'),
('Infinity', '\u221e'), ('Infinity', '∞'),
('Lozenge', '\u25ca'), ('Lozenge', '◊'),
('Math divide', '\u00f7'), ('Math divide', '÷'),
('Math empty', '\u2205'), ('Math empty', '∅'),
('Math equals', '='), ('Math equals', '='),
('Math minus', '\u2212'), ('Math minus', '−'),
('Math plus circled', '\u2295'), ('Math plus circled', '⊕'),
('Math times circled', '\u2297'), ('Math times circled', '⊗'),
('Math times', '\u00d7'), ('Math times', '×'),
('Paragraph', '\u00b6'), ('Paragraph', '¶'),
('Percent', '%'), ('Percent', '%'),
('Plus-or-minus', '\u00b1'), ('Plus-or-minus', '±'),
('Plus', '+'), ('Plus', '+'),
('Punctuation colon', ':'), ('Punctuation colon', ':'),
('Punctuation colon-semi', ';'), ('Punctuation colon-semi', ';'),
@ -1405,10 +1405,10 @@ class PrefixRules(GenericRulesTable):
('Punctuation period', '.'), ('Punctuation period', '.'),
('Punctuation slash back', '\\'), ('Punctuation slash back', '\\'),
('Punctuation slash forward', '/'), ('Punctuation slash forward', '/'),
('Section', '\u00a7'), ('Section', '§'),
('Tilde', '~'), ('Tilde', '~'),
('Vertical bar', '|'), ('Vertical bar', '|'),
('Vertical bar broken', '\u00a6'), ('Vertical bar broken', '¦'),
('_0', '0'), ('_0', '0'),
('_1', '1'), ('_1', '1'),
('_2', '2'), ('_2', '2'),

View File

@ -465,7 +465,7 @@ class MetadataSingleDialogBase(QDialog):
def update_window_title(self, *args): def update_window_title(self, *args):
title = self.title.current_val title = self.title.current_val
if len(title) > 50: if len(title) > 50:
title = title[:50] + '\u2026' title = title[:50] + '…'
self.setWindowTitle(BASE_TITLE + ' - ' + self.setWindowTitle(BASE_TITLE + ' - ' +
title + ' -' + title + ' -' +
_(' [%(num)d of %(tot)d]')%dict(num=self.current_row+1, _(' [%(num)d of %(tot)d]')%dict(num=self.current_row+1,

View File

@ -113,7 +113,7 @@ def format_price_in_RUR(price):
''' '''
if price and re.match(r'^\d*?\.\d*?$', price): if price and re.match(r'^\d*?\.\d*?$', price):
try: try:
price = u'{:,.2F} \u20BD'.format(float(price)) # \u20BD => руб. price = u'{:,.2F} \u20bd'.format(float(price)) # \u20bd => руб.
price = price.replace(',', ' ').replace('.', ',', 1) price = price.replace(',', ' ').replace('.', ',', 1)
except: except:
pass pass

View File

@ -156,7 +156,7 @@ class EPUB_MOBI(CatalogPlugin):
"Default: '%default'\n" "Default: '%default'\n"
"Applies to: AZW3, EPUB, MOBI output formats")), "Applies to: AZW3, EPUB, MOBI output formats")),
Option('--prefix-rules', Option('--prefix-rules',
default="(('Read books','tags','+','\u2713'),('Wishlist item','tags','Wishlist','\u00d7'))", default="(('Read books','tags','+','✓'),('Wishlist item','tags','Wishlist','×'))",
dest='prefix_rules', dest='prefix_rules',
action=None, action=None,
help=_("Specifies the rules used to include prefixes indicating read books, wishlist items and other user-specified prefixes.\n" help=_("Specifies the rules used to include prefixes indicating read books, wishlist items and other user-specified prefixes.\n"

View File

@ -227,7 +227,7 @@ class Tester(SearchQueryParser):
'London : Jonathan Cape, 2005.', 'London : Jonathan Cape, 2005.',
'lrf,txt'], 'lrf,txt'],
259: ['My name is Red', 259: ['My name is Red',
'Orhan Pamuk; translated from the Turkish by Erda\u011f G\xf6knar', 'Orhan Pamuk; translated from the Turkish by Erdağ G\xf6knar',
'New York : Alfred A. Knopf, 2001.', 'New York : Alfred A. Knopf, 2001.',
'lit,lrf'], 'lit,lrf'],
265: ['Harbinger', 'David Mack', 'Star Trek', 'lit,lrf'], 265: ['Harbinger', 'David Mack', 'Star Trek', 'lit,lrf'],

View File

@ -286,7 +286,7 @@ def test():
text = [colored(t, fg=t)+'. '+colored(t, fg=t, bold=True)+'.' for t in text = [colored(t, fg=t)+'. '+colored(t, fg=t, bold=True)+'.' for t in
('red', 'yellow', 'green', 'white', 'cyan', 'magenta', 'blue',)] ('red', 'yellow', 'green', 'white', 'cyan', 'magenta', 'blue',)]
s.write('\n'.join(text)) s.write('\n'.join(text))
u = '\u041c\u0438\u0445\u0430\u0438\u043b fällen' u = 'Михаил fällen'
print() print()
s.write(u) s.write(u)
print() print()

View File

@ -1183,7 +1183,7 @@ class BasicNewsRecipe(Recipe):
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
# Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it # Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it
ans = clean_xml_chars(ans) + '\u2026' ans = clean_xml_chars(ans) + '…'
return ans return ans
def feed2index(self, f, feeds): def feed2index(self, f, feeds):