decode unicode-escape (extra-edit)

This commit is contained in:
un-pogaz 2025-01-24 11:14:16 +01:00
parent 12cb8b2e58
commit 534293eabc
17 changed files with 87 additions and 87 deletions

View File

@ -299,8 +299,8 @@ class iPadOutput(OutputProfile):
}
]
ratings_char = '\u2605' # filled star
empty_ratings_char = '\u2606' # hollow star
ratings_char = '★' # filled star
empty_ratings_char = '☆' # hollow star
touchscreen = True
# touchscreen_news_css {{{
@ -677,8 +677,8 @@ class KindleOutput(OutputProfile):
supports_mobi_indexing = True
periodical_date_in_title = False
empty_ratings_char = '\u2606'
ratings_char = '\u2605'
empty_ratings_char = '☆'
ratings_char = '★'
mobi_ems_per_blockquote = 2.0
@ -696,8 +696,8 @@ class KindleDXOutput(OutputProfile):
# comic_screen_size = (741, 1022)
supports_mobi_indexing = True
periodical_date_in_title = False
empty_ratings_char = '\u2606'
ratings_char = '\u2605'
empty_ratings_char = '☆'
ratings_char = '★'
mobi_ems_per_blockquote = 2.0

View File

@ -77,7 +77,7 @@ def _get_comments(soup):
pages = (_metadata_from_span(soup, 'pages') or _metadata_from_table(soup, 'pages'))
try:
# date span can have copyright symbols in it...
date = date.replace('\u00a9', '').strip()
date = date.replace('©', '').strip()
# and pages often comes as '(\d+ pages)'
pages = re.search(r'\d+', pages).group(0)
return f'Published {date}, {pages} pages.'

View File

@ -19,20 +19,20 @@ XLINK_NS = 'http://www.w3.org/1999/xlink'
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
LIGATURES = {
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',
'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st',
# 'Æ': 'AE',
# 'æ': 'ae',
# 'Œ': 'OE',
# 'œ': 'oe',
# 'Ĳ': 'IJ',
# 'ĳ': 'ij',
# 'ᵫ': 'ue',
'ﬀ': 'ff',
'ﬁ': 'fi',
'ﬂ': 'fl',
'ﬃ': 'ffi',
'ﬄ': 'ffl',
'ﬅ': 'ft',
'ﬆ': 'st',
}
_ligpat = re.compile('|'.join(LIGATURES))
@ -240,7 +240,7 @@ class Dehyphenator:
else:
if self.verbose > 2:
self.log(' Cleanup:returning original text '+firsthalf+' + linefeed '+secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf
return firsthalf+'—'+wraptags+secondhalf
else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
@ -274,7 +274,7 @@ class Dehyphenator:
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
elif format == 'txt':
intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)( |\t)*(?P<wraptags>(\n( |\t)*)+)(?P<secondpart>[\\w\\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)

View File

@ -2194,8 +2194,8 @@ def find_tests():
t('a&amp;b&lt;c', 'a&b<c')
t('a&acE;b', 'a∾̳b')
t('a&#1234;b', 'aӒb')
t('a&#X1234;b', 'a\u1234b')
t('a&#x1034fA;b', 'a\U001034fAb')
t('a&#X1234;b', 'aሴb')
t('a&#x1034fA;b', 'a\U001034FAb')
t('a&#0;b&#x000;c', 'abc')
x('&amp;&lt;&gt;&apos;&quot;', '&amp;&lt;&gt;&apos;&quot;')

View File

@ -803,7 +803,7 @@ class HTMLConverter:
src = src.lstrip()
f = src[0]
next = 1
if f in ("'", '"', '\u201c', '\u2018', '\u201d', '\u2019'):
if f in ("'", '"', '“', '‘', '”', '’'):
if len(src) >= 2:
next = 2
f = src[:2]
@ -819,7 +819,7 @@ class HTMLConverter:
def append_text(src):
fp, key, variant = self.font_properties(css)
for x, y in [('\xad', ''), ('\xa0', ' '), ('\ufb00', 'ff'), ('\ufb01', 'fi'), ('\ufb02', 'fl'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl')]:
for x, y in [('\xad', ''), ('\xa0', ' '), ('ﬀ', 'ff'), ('ﬁ', 'fi'), ('ﬂ', 'fl'), ('ﬃ', 'ffi'), ('ﬄ', 'ffl')]:
src = src.replace(x, y)
def valigner(x):
@ -1624,7 +1624,7 @@ class HTMLConverter:
in_ol = parent.name.lower() == 'ol'
break
parent = parent.parent
prepend = str(self.list_counter)+'. ' if in_ol else '\u2022' + ' '
prepend = str(self.list_counter)+'. ' if in_ol else '•' + ' '
self.current_para.append(Span(prepend))
self.process_children(tag, tag_css, tag_pseudo_css)
if in_ol:

View File

@ -529,11 +529,11 @@ class MobiMLizer:
t = elem.text
if not t:
t = ''
elem.text = '\u201c' + t
elem.text = '“' + t
t = elem.tail
if not t:
t = ''
elem.tail = '\u201d' + t
elem.tail = '”' + t
text = None
if elem.text:
if istate.preserve or istate.pre_wrap:

View File

@ -362,7 +362,7 @@ class Serializer:
text = text.replace('&', '&amp;')
text = text.replace('<', '&lt;')
text = text.replace('>', '&gt;')
text = text.replace('\u00AD', '') # Soft-hyphen
text = text.replace('\u00ad', '') # Soft-hyphen
if quot:
text = text.replace('"', '&quot;')
if isinstance(text, str):

View File

@ -100,9 +100,9 @@ def adjacent_quotes(first_string, second_string):
if fchar is not None:
first_char = fchar.group(1) # First non-space char
return bool((last_char == '\u0022' and first_char == '\u0022') \
or (last_char == '\u2019' and first_char == '\u2018') \
or (last_char == '\u201d' and first_char == '\u201c'))
return bool((last_char == '"' and first_char == '"') \
or (last_char == '’' and first_char == '‘') \
or (last_char == '”' and first_char == '“'))
class Font:

View File

@ -24,9 +24,9 @@ def normalize_entities(cur_title):
'\u2013':'-',
'&mdash;': '-',
'&ndash;': '-',
'\u00A0': ' ',
'\u00AB': '"',
'\u00BB': '"',
'\u00a0': ' ',
'\u00ab': '"',
'\u00bb': '"',
'&quot;': '"',
}
for c, r in iteritems(entities):

View File

@ -1693,7 +1693,7 @@ def elided_text(text, font=None, width=300, pos='middle'):
font = QApplication.instance().font()
fm = (font if isinstance(font, QFontMetrics) else QFontMetrics(font))
delta = 4
ellipsis = '\u2026'
ellipsis = '…'
def remove_middle(x):
mid = len(x) // 2

View File

@ -121,13 +121,13 @@ class PluginWidget(QWidget,Ui_Form):
'name':_('Read book'),
'field':_('Tags'),
'pattern':'+',
'prefix':'\u2713'},
'prefix':'✓'},
{'ordinal':1,
'enabled':True,
'name':_('Wishlist item'),
'field':_('Tags'),
'pattern':'Wishlist',
'prefix':'\u00d7'},],
'prefix':'×'},],
['table_widget','table_widget']))
self.OPTION_FIELDS = option_fields
@ -1351,52 +1351,52 @@ class PrefixRules(GenericRulesTable):
# Create a list of prefixes for user selection
raw_prefix_list = [
('Ampersand', '&'),
('Angle left double', '\u00ab'),
('Angle left', '\u2039'),
('Angle right double', '\u00bb'),
('Angle right', '\u203a'),
('Arrow carriage return', '\u21b5'),
('Arrow double', '\u2194'),
('Arrow down', '\u2193'),
('Arrow left', '\u2190'),
('Arrow right', '\u2192'),
('Arrow up', '\u2191'),
('Angle left double', '«'),
('Angle left', '‹'),
('Angle right double', '»'),
('Angle right', '›'),
('Arrow carriage return', '↵'),
('Arrow double', '↔'),
('Arrow down', '↓'),
('Arrow left', '←'),
('Arrow right', '→'),
('Arrow up', '↑'),
('Asterisk', '*'),
('At sign', '@'),
('Bullet smallest', '\u22c5'),
('Bullet small', '\u00b7'),
('Bullet', '\u2022'),
('Cards clubs', '\u2663'),
('Cards diamonds', '\u2666'),
('Cards hearts', '\u2665'),
('Cards spades', '\u2660'),
('Bullet smallest', '⋅'),
('Bullet small', '·'),
('Bullet', '•'),
('Cards clubs', '♣'),
('Cards diamonds', '♦'),
('Cards hearts', '♥'),
('Cards spades', '♠'),
('Caret', '^'),
('Checkmark', '\u2713'),
('Copyright circle c', '\u00a9'),
('Copyright circle r', '\u00ae'),
('Copyright trademark', '\u2122'),
('Currency cent', '\u00a2'),
('Checkmark', '✓'),
('Copyright circle c', '©'),
('Copyright circle r', '®'),
('Copyright trademark', '™'),
('Currency cent', '¢'),
('Currency dollar', '$'),
('Currency euro', '\u20ac'),
('Currency pound', '\u00a3'),
('Currency yen', '\u00a5'),
('Dagger double', '\u2021'),
('Dagger', '\u2020'),
('Degree', '\u00b0'),
('Dots3', '\u2234'),
('Currency euro', '€'),
('Currency pound', '£'),
('Currency yen', '¥'),
('Dagger double', '‡'),
('Dagger', '†'),
('Degree', '°'),
('Dots3', '∴'),
('Hash', '#'),
('Infinity', '\u221e'),
('Lozenge', '\u25ca'),
('Math divide', '\u00f7'),
('Math empty', '\u2205'),
('Infinity', '∞'),
('Lozenge', '◊'),
('Math divide', '÷'),
('Math empty', '∅'),
('Math equals', '='),
('Math minus', '\u2212'),
('Math plus circled', '\u2295'),
('Math times circled', '\u2297'),
('Math times', '\u00d7'),
('Paragraph', '\u00b6'),
('Math minus', '−'),
('Math plus circled', '⊕'),
('Math times circled', '⊗'),
('Math times', '×'),
('Paragraph', '¶'),
('Percent', '%'),
('Plus-or-minus', '\u00b1'),
('Plus-or-minus', '±'),
('Plus', '+'),
('Punctuation colon', ':'),
('Punctuation colon-semi', ';'),
@ -1405,10 +1405,10 @@ class PrefixRules(GenericRulesTable):
('Punctuation period', '.'),
('Punctuation slash back', '\\'),
('Punctuation slash forward', '/'),
('Section', '\u00a7'),
('Section', '§'),
('Tilde', '~'),
('Vertical bar', '|'),
('Vertical bar broken', '\u00a6'),
('Vertical bar broken', '¦'),
('_0', '0'),
('_1', '1'),
('_2', '2'),

View File

@ -465,7 +465,7 @@ class MetadataSingleDialogBase(QDialog):
def update_window_title(self, *args):
title = self.title.current_val
if len(title) > 50:
title = title[:50] + '\u2026'
title = title[:50] + '…'
self.setWindowTitle(BASE_TITLE + ' - ' +
title + ' -' +
_(' [%(num)d of %(tot)d]')%dict(num=self.current_row+1,

View File

@ -113,7 +113,7 @@ def format_price_in_RUR(price):
'''
if price and re.match(r'^\d*?\.\d*?$', price):
try:
price = u'{:,.2F} \u20BD'.format(float(price)) # \u20BD => руб.
price = u'{:,.2F} \u20bd'.format(float(price)) # \u20bd => руб.
price = price.replace(',', ' ').replace('.', ',', 1)
except:
pass

View File

@ -156,7 +156,7 @@ class EPUB_MOBI(CatalogPlugin):
"Default: '%default'\n"
"Applies to: AZW3, EPUB, MOBI output formats")),
Option('--prefix-rules',
default="(('Read books','tags','+','\u2713'),('Wishlist item','tags','Wishlist','\u00d7'))",
default="(('Read books','tags','+','✓'),('Wishlist item','tags','Wishlist','×'))",
dest='prefix_rules',
action=None,
help=_("Specifies the rules used to include prefixes indicating read books, wishlist items and other user-specified prefixes.\n"

View File

@ -227,7 +227,7 @@ class Tester(SearchQueryParser):
'London : Jonathan Cape, 2005.',
'lrf,txt'],
259: ['My name is Red',
'Orhan Pamuk; translated from the Turkish by Erda\u011f G\xf6knar',
'Orhan Pamuk; translated from the Turkish by Erdağ G\xf6knar',
'New York : Alfred A. Knopf, 2001.',
'lit,lrf'],
265: ['Harbinger', 'David Mack', 'Star Trek', 'lit,lrf'],

View File

@ -286,7 +286,7 @@ def test():
text = [colored(t, fg=t)+'. '+colored(t, fg=t, bold=True)+'.' for t in
('red', 'yellow', 'green', 'white', 'cyan', 'magenta', 'blue',)]
s.write('\n'.join(text))
u = '\u041c\u0438\u0445\u0430\u0438\u043b fällen'
u = 'Михаил fällen'
print()
s.write(u)
print()

View File

@ -1183,7 +1183,7 @@ class BasicNewsRecipe(Recipe):
from calibre.utils.cleantext import clean_xml_chars
# Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it
ans = clean_xml_chars(ans) + '\u2026'
ans = clean_xml_chars(ans) + '…'
return ans
def feed2index(self, f, feeds):