# coding: utf8
"""
    tinycss.decoding
    ----------------

    Decoding stylesheets from bytes to Unicode.
    http://www.w3.org/TR/CSS21/syndata.html#charset

    :copyright: (c) 2012 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""


import operator
import re

from polyglot.binary import from_hex_bytes


__all__ = ['decode']  # Everything else is implementation detail


def decode(css_bytes, protocol_encoding=None,
           linking_encoding=None, document_encoding=None):
    """
    Determine the character encoding from the passed metadata and the
    ``@charset`` rule in the stylesheet (if any), and decode accordingly.
    If no encoding information is available or decoding fails,
    decoding defaults to UTF-8 and then falls back on ISO-8859-1.

    :param css_bytes:
        a CSS stylesheet as a byte string
    :param protocol_encoding:
        The "charset" parameter of a "Content-Type" HTTP header (if any),
        or similar metadata for other protocols.
    :param linking_encoding:
        ``<link charset="">`` or other metadata from the linking mechanism
        (if any)
    :param document_encoding:
        Encoding of the referring stylesheet or document (if any)
    :return:
        A tuple of a Unicode string, with any BOM removed, and the
        encoding that was used.

    """
    if protocol_encoding:
        css_unicode = try_encoding(css_bytes, protocol_encoding)
        if css_unicode is not None:
            return css_unicode, protocol_encoding
    for encoding, pattern in ENCODING_MAGIC_NUMBERS:
        match = pattern(css_bytes)
        if match:
            has_at_charset = isinstance(encoding, tuple)
            if has_at_charset:
                extract, endianness = encoding
                encoding = extract(match.group(1))
                # Get an ASCII-only unicode value.
                # This is the only thing that works on both Python 2 and 3
                # for bytes.decode()
                # Non-ASCII encoding names are invalid anyway,
                # but make sure they stay invalid.
                encoding = encoding.decode('ascii', 'replace')
                encoding = encoding.replace('\ufffd', '?')
                if encoding.replace('-', '').replace('_', '').lower() in [
                        'utf16', 'utf32']:
                    encoding += endianness
                encoding = encoding.encode('ascii', 'replace').decode('ascii')
            css_unicode = try_encoding(css_bytes, encoding)
            if css_unicode and not (has_at_charset and not
                                    css_unicode.startswith('@charset "')):
                return css_unicode, encoding
            break
    for encoding in [linking_encoding, document_encoding]:
        if encoding:
            css_unicode = try_encoding(css_bytes, encoding)
            if css_unicode is not None:
                return css_unicode, encoding
    css_unicode = try_encoding(css_bytes, 'UTF-8')
    if css_unicode is not None:
        return css_unicode, 'UTF-8'
    return try_encoding(css_bytes, 'ISO-8859-1', fallback=False), 'ISO-8859-1'
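

# Illustrative use of decode() (a sketch, not executed at import time; the
# byte strings below are made-up examples rather than tinycss fixtures):
#
#   css, enc = decode(b'@charset "ISO-8859-1"; a{color:r\xe9d}')
#   # enc == 'ISO-8859-1': the @charset rule in the bytes is honoured.
#
#   css, enc = decode(b'a{}', protocol_encoding='UTF-8')
#   # enc == 'UTF-8': protocol metadata wins when it decodes successfully.
#
#   css, enc = decode(b'a{content:"\xe9"}')
#   # enc == 'ISO-8859-1': invalid as UTF-8, so the Latin-1 fallback is used.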


def try_encoding(css_bytes, encoding, fallback=True):
    if fallback:
        try:
            css_unicode = css_bytes.decode(encoding)
        # LookupError means unknown encoding
        except (UnicodeDecodeError, LookupError):
            return None
    else:
        css_unicode = css_bytes.decode(encoding)
    if css_unicode and css_unicode[0] == '\ufeff':
        # Remove any Byte Order Mark
        css_unicode = css_unicode[1:]
    return css_unicode
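
# For instance (illustrative values only; 'no-such-codec' is a deliberately
# invalid name):
#   try_encoding(b'\xff\xfea\x00', 'utf-16-le') == 'a'    # BOM stripped
#   try_encoding(b'\xc3', 'utf-8') is None                # undecodable bytes
#   try_encoding(b'a{}', 'no-such-codec') is None         # unknown encoding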


def hex2re(hex_data):
    return re.escape(from_hex_bytes(hex_data.replace(' ', '').encode('ascii')))


class Slicer:
    """Slice[start:stop:step] == operator.itemgetter(slice(start, stop, step))"""
    def __getitem__(self, slice_):
        return operator.itemgetter(slice_)


Slice = Slicer()
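
# For instance (illustrative values only):
#   hex2re('40 63 68 61 72 73 65 74')     # == re.escape(b'@charset')
#   Slice[1::2](b'\x00u\x00t\x00f')       # == b'utf' (drop UTF-16-BE zero bytes)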


# List of (encoding, pattern) pairs; pattern is a compiled regex's .match.
#   encoding is an encoding name, or (extract, endianness) for "as specified"
#   by an @charset rule; extract is an operator.itemgetter built from a slice,
#   used to pull the specified encoding name out of the matched bytes.

ENCODING_MAGIC_NUMBERS = [
    ((Slice[:], ''), re.compile(
        hex2re('EF BB BF 40 63 68 61 72 73 65 74 20 22')
        + b'([^\x22]*?)'
        + hex2re('22 3B')).match),

    ('UTF-8', re.compile(
        hex2re('EF BB BF')).match),

    ((Slice[:], ''), re.compile(
        hex2re('40 63 68 61 72 73 65 74 20 22')
        + b'([^\x22]*?)'
        + hex2re('22 3B')).match),

    ((Slice[1::2], '-BE'), re.compile(
        hex2re('FE FF 00 40 00 63 00 68 00 61 00 72 00 73 00 65 00'
               '74 00 20 00 22')
        + b'((\x00[^\x22])*?)'
        + hex2re('00 22 00 3B')).match),

    ((Slice[1::2], '-BE'), re.compile(
        hex2re('00 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00'
               '20 00 22')
        + b'((\x00[^\x22])*?)'
        + hex2re('00 22 00 3B')).match),

    ((Slice[::2], '-LE'), re.compile(
        hex2re('FF FE 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74'
               '00 20 00 22 00')
        + b'(([^\x22]\x00)*?)'
        + hex2re('22 00 3B 00')).match),

    ((Slice[::2], '-LE'), re.compile(
        hex2re('40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00 20'
               '00 22 00')
        + b'(([^\x22]\x00)*?)'
        + hex2re('22 00 3B 00')).match),

    ((Slice[3::4], '-BE'), re.compile(
        hex2re('00 00 FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00'
               '00 00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00'
               '00 74 00 00 00 20 00 00 00 22')
        + b'((\x00\x00\x00[^\x22])*?)'
        + hex2re('00 00 00 22 00 00 00 3B')).match),

    ((Slice[3::4], '-BE'), re.compile(
        hex2re('00 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00'
               '00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00'
               '00 20 00 00 00 22')
        + b'((\x00\x00\x00[^\x22])*?)'
        + hex2re('00 00 00 22 00 00 00 3B')).match),

    # Python does not support 2143 or 3412 endianness, AFAIK.
    # I guess we could fix it up ourselves but meh. Patches welcome.

    # ((Slice[2::4], '-2143'), re.compile(
    #     hex2re('00 00 FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00'
    #            '00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00'
    #            '74 00 00 00 20 00 00 00 22 00')
    #     + b'((\x00\x00[^\x22]\x00)*?)'
    #     + hex2re('00 00 22 00 00 00 3B 00')).match),

    # ((Slice[2::4], '-2143'), re.compile(
    #     hex2re('00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00'
    #            '00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00'
    #            '20 00 00 00 22 00')
    #     + b'((\x00\x00[^\x22]\x00)*?)'
    #     + hex2re('00 00 22 00 00 00 3B 00')).match),

    # ((Slice[1::4], '-3412'), re.compile(
    #     hex2re('FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00 00 00'
    #            '61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74'
    #            '00 00 00 20 00 00 00 22 00 00')
    #     + b'((\x00[^\x22]\x00\x00)*?)'
    #     + hex2re('00 22 00 00 00 3B 00 00')).match),

    # ((Slice[1::4], '-3412'), re.compile(
    #     hex2re('00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00'
    #            '72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20'
    #            '00 00 00 22 00 00')
    #     + b'((\x00[^\x22]\x00\x00)*?)'
    #     + hex2re('00 22 00 00 00 3B 00 00')).match),

    ((Slice[::4], '-LE'), re.compile(
        hex2re('FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61'
               '00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00'
               '00 00 20 00 00 00 22 00 00 00')
        + b'(([^\x22]\x00\x00\x00)*?)'
        + hex2re('22 00 00 00 3B 00 00 00')).match),

    ((Slice[::4], '-LE'), re.compile(
        hex2re('40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00 72'
               '00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20 00'
               '00 00 22 00 00 00')
        + b'(([^\x22]\x00\x00\x00)*?)'
        + hex2re('22 00 00 00 3B 00 00 00')).match),

    ('UTF-32-BE', re.compile(
        hex2re('00 00 FE FF')).match),

    ('UTF-32-LE', re.compile(
        hex2re('FF FE 00 00')).match),

    # ('UTF-32-2143', re.compile(
    #     hex2re('00 00 FF FE')).match),

    # ('UTF-32-3412', re.compile(
    #     hex2re('FE FF 00 00')).match),

    ('UTF-16-BE', re.compile(
        hex2re('FE FF')).match),

    ('UTF-16-LE', re.compile(
        hex2re('FF FE')).match),


    # Some of these are supported by Python, but I didn't bother.
    # You know the story with patches ...

    # # as specified, transcoded from EBCDIC to ASCII
    # ('as_specified-EBCDIC', re.compile(
    #     hex2re('7C 83 88 81 99 A2 85 A3 40 7F')
    #     + b'([^\x7F]*?)'
    #     + hex2re('7F 5E')).match),

    # # as specified, transcoded from IBM1026 to ASCII
    # ('as_specified-IBM1026', re.compile(
    #     hex2re('AE 83 88 81 99 A2 85 A3 40 FC')
    #     + b'([^\xFC]*?)'
    #     + hex2re('FC 5E')).match),

    # # as specified, transcoded from GSM 03.38 to ASCII
    # ('as_specified-GSM_03.38', re.compile(
    #     hex2re('00 63 68 61 72 73 65 74 20 22')
    #     + b'([^\x22]*?)'
    #     + hex2re('22 3B')).match),
]
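

# A minimal, illustrative self-check (not part of the tinycss API): it
# exercises the BOM-prefixed UTF-16-LE ``@charset`` magic number above.
# It only runs when the module is executed directly, and it assumes
# ``polyglot`` is importable, i.e. that it is run from the calibre tree.
if __name__ == '__main__':
    # '@charset "utf-16"' encoded as UTF-16-LE, preceded by a BOM.
    source = b'\xff\xfe' + '@charset "utf-16"; a{color:red}'.encode('utf-16-le')
    css, used_encoding = decode(source)
    # Expected output: utf-16-LE '@charset "utf-16"; a{color:red}'
    print(used_encoding, repr(css))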