mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement serialization of tokens
This commit is contained in:
parent
94756715ef
commit
f1e4095225
@ -32,7 +32,7 @@ is_surrogate(char32_t ch) {
|
||||
}
|
||||
|
||||
// Return true when ch is an ASCII hexadecimal digit: 0-9, a-f or A-F.
static inline bool
is_hex_digit(char32_t ch) {
    // Both bounds of every range must be joined with &&. Using || for the
    // upper-case range makes the whole predicate true for every code point.
    return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
|
||||
|
||||
@ -41,14 +41,19 @@ is_letter(char32_t ch) {
|
||||
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
|
||||
}
|
||||
|
||||
// True when ch is one of the ASCII decimal digits '0' through '9'.
static inline bool
is_digit(char32_t ch) {
    const bool below_zero = ch < '0';
    const bool above_nine = ch > '9';
    return !(below_zero || above_nine);
}
|
||||
|
||||
static inline bool
|
||||
is_name_start(char32_t ch) {
|
||||
return is_letter(ch) || ch == '_' || ch >= 0x80;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_digit(char32_t ch) {
|
||||
return '0' <= ch && ch <= '9';
|
||||
is_name_body(char32_t ch) {
|
||||
return is_name_start(ch) || is_digit(ch) || ch == '-';
|
||||
}
|
||||
|
||||
static inline bool
|
||||
@ -220,7 +225,6 @@ enum class TokenType : unsigned int {
|
||||
whitespace,
|
||||
delimiter,
|
||||
ident,
|
||||
function,
|
||||
at_keyword,
|
||||
hash,
|
||||
string,
|
||||
@ -228,7 +232,6 @@ enum class TokenType : unsigned int {
|
||||
function_start,
|
||||
number,
|
||||
dimension,
|
||||
percentage,
|
||||
comment,
|
||||
cdo,
|
||||
cdc
|
||||
@ -236,6 +239,8 @@ enum class TokenType : unsigned int {
|
||||
|
||||
|
||||
class Token {
|
||||
// Progress marker used by serialize_ident(): `start` before any character has
// been handled, `one_hyphen` after a single leading '-', `body` afterwards.
enum class NameSerializeState : unsigned { start, one_hyphen, body };
|
||||
|
||||
private:
|
||||
TokenType type;
|
||||
std::u32string text;
|
||||
@ -247,6 +252,55 @@ class Token {
|
||||
unit_at = 0;
|
||||
}
|
||||
|
||||
void serialize_escaped_char(const char32_t ch, std::u32string &out) const {
|
||||
out.push_back('\\');
|
||||
if (is_whitespace(ch) || is_hex_digit(ch)) {
|
||||
char buf[8];
|
||||
int num = std::snprintf(buf, sizeof(buf), "%x ", ch);
|
||||
if (num > 0) {
|
||||
out.resize(out.size() + num);
|
||||
for (int i = 0; i < num; i++) out[i + out.size() - num] = buf[i];
|
||||
} else throw std::runtime_error("Failed to convert character to hexedecimal escape");
|
||||
} else out.push_back(ch);
|
||||
}
|
||||
|
||||
void serialize_ident(std::u32string &out) const {
|
||||
NameSerializeState state = NameSerializeState::start;
|
||||
for (const auto ch : text) {
|
||||
switch(state) {
|
||||
case NameSerializeState::start:
|
||||
if (is_name_start(ch)) { out.push_back(ch); state = NameSerializeState::body; }
|
||||
else if (ch == '-') { out.push_back(ch); state = NameSerializeState::one_hyphen; }
|
||||
else throw std::logic_error("Unable to serialize ident because of invalid start character");
|
||||
break;
|
||||
case NameSerializeState::one_hyphen:
|
||||
if (is_name_start(ch) || ch == '-') { out.push_back(ch); state = NameSerializeState::body; }
|
||||
else serialize_escaped_char(ch, out);
|
||||
break;
|
||||
case NameSerializeState::body:
|
||||
if (is_name_body(ch)) out.push_back(ch);
|
||||
else serialize_escaped_char(ch, out);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void serialize_hash(std::u32string &out) const {
|
||||
for (const auto ch : text) {
|
||||
if (is_name_body(ch)) out.push_back(ch);
|
||||
else serialize_escaped_char(ch, out);
|
||||
}
|
||||
}
|
||||
|
||||
void serialize_string(std::u32string &out) const {
|
||||
const char32_t delim = text.find('"') == std::u32string::npos ? '"' : '\'';
|
||||
for (const auto ch : text) {
|
||||
if (ch == '\n') out.append({'\\', '\n'});
|
||||
else if (ch == delim || ch == '\\') serialize_escaped_char(ch, out);
|
||||
else out.push_back(ch);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
Token() :
|
||||
type(TokenType::whitespace), text(), unit_at(0), out_pos(0) {
|
||||
@ -279,6 +333,7 @@ class Token {
|
||||
|
||||
// Current type of this token.
TokenType get_type() const {
    return this->type;
}
|
||||
void set_type(const TokenType q) { type = q; }
|
||||
size_t get_output_position() const { return out_pos; }
|
||||
void set_output_position(const size_t val) { out_pos = val; }
|
||||
bool is_type(const TokenType q) const { return type == q; }
|
||||
// True when this is a delimiter token whose text is exactly the single character ch.
bool is_delimiter(const char32_t ch) const {
    if (type != TokenType::delimiter) return false;
    if (text.size() != 1) return false;
    return text[0] == ch;
}
|
||||
@ -374,6 +429,53 @@ class Token {
|
||||
set_text(scratch);
|
||||
return true;
|
||||
}
|
||||
|
||||
void serialize(std::u32string &out) const {
|
||||
switch (type) {
|
||||
case TokenType::whitespace:
|
||||
case TokenType::delimiter:
|
||||
out.append(text);
|
||||
break;
|
||||
case TokenType::ident:
|
||||
serialize_ident(out);
|
||||
break;
|
||||
case TokenType::at_keyword:
|
||||
out.push_back('@');
|
||||
serialize_ident(out);
|
||||
break;
|
||||
case TokenType::hash:
|
||||
out.push_back('#');
|
||||
serialize_hash(out);
|
||||
break;
|
||||
case TokenType::string:
|
||||
serialize_string(out);
|
||||
break;
|
||||
case TokenType::url:
|
||||
out.append({'u', 'r', 'l', '('});
|
||||
serialize_string(out);
|
||||
out.push_back(')');
|
||||
break;
|
||||
case TokenType::function_start:
|
||||
serialize_ident(out);
|
||||
out.push_back('(');
|
||||
break;
|
||||
case TokenType::number:
|
||||
case TokenType::dimension:
|
||||
out.append(text);
|
||||
break;
|
||||
case TokenType::comment:
|
||||
out.append({'/', '*'});
|
||||
out.append(text);
|
||||
out.append({'*', '/'});
|
||||
break;
|
||||
case TokenType::cdo:
|
||||
out.append({'<', '!', '-', '-'});
|
||||
break;
|
||||
case TokenType::cdc:
|
||||
out.append({'-', '-', '>'});
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class TokenQueue {
|
||||
@ -384,6 +486,8 @@ class TokenQueue {
|
||||
std::string scratch, scratch2;
|
||||
pyobject_raii url_callback;
|
||||
|
||||
// Current size of the output buffer `out`; new_token() passes this value when it constructs a token.
size_t current_output_position() const {
    return out.size();
}
|
||||
|
||||
void new_token(const TokenType type, const char32_t ch = 0) {
|
||||
if (pool.empty()) queue.emplace_back(type, ch, current_output_position());
|
||||
else {
|
||||
@ -394,8 +498,6 @@ class TokenQueue {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): this line is identical to the definition that appears just before new_token();
// it looks like the pre-move copy from the diff and would be a redefinition if both were kept — confirm.
size_t current_output_position() const { return out.size(); }
|
||||
|
||||
void add_char_of_type(const TokenType type, const char32_t ch) {
|
||||
if (!queue.empty() && queue.back().is_type(type)) queue.back().add_char(ch);
|
||||
else new_token(type, ch);
|
||||
@ -552,16 +654,16 @@ class TokenQueue {
|
||||
|
||||
// Start a new delimiter token via new_token(), passing ch along.
void add_delimiter(const char32_t ch) {
    new_token(TokenType::delimiter, ch);
}
|
||||
|
||||
// Start a new hash token. The '#' is not passed to the token because
// Token::serialize() writes it itself for TokenType::hash.
void add_hash() { new_token(TokenType::hash); }
|
||||
|
||||
// Start a new at-keyword token. The '@' is not passed to the token because
// Token::serialize() writes it itself for TokenType::at_keyword.
void add_at_keyword() { new_token(TokenType::at_keyword); }
|
||||
|
||||
// Start a new number token via new_token(), passing ch along.
void add_number(const char32_t ch) {
    new_token(TokenType::number, ch);
}
|
||||
|
||||
// Start a new ident token via new_token(), passing ch along.
void add_ident(const char32_t ch) {
    new_token(TokenType::ident, ch);
}
|
||||
|
||||
// Start a CDC ("-->") or CDO ("<!--") token. No characters are passed to the token:
// Token::serialize() writes the fixed spelling for these two types.
void add_cdc() { new_token(TokenType::cdc); }

void add_cdo() { new_token(TokenType::cdo); }
|
||||
|
||||
void mark_unit() {
|
||||
if (queue.empty()) throw std::logic_error("Attempting to mark unit with no token present");
|
||||
@ -596,6 +698,10 @@ class TokenQueue {
|
||||
} else {
|
||||
if (process_declaration()) changed = true;
|
||||
}
|
||||
if (changed && queue.size()) {
|
||||
out.resize(queue[0].get_output_position());
|
||||
for (auto tok : queue) tok.serialize(out);
|
||||
}
|
||||
return_tokens_to_pool();
|
||||
}
|
||||
};
|
||||
@ -761,7 +867,7 @@ class Parser {
|
||||
void handle_escape() {
|
||||
if (!escape_buf_pos) {
|
||||
if (ch == '\n') { reconsume(); states.pop(); return; }
|
||||
if (!is_hex_char(ch)) {
|
||||
if (!is_hex_digit(ch)) {
|
||||
states.pop();
|
||||
token_queue.add_char(ch);
|
||||
return;
|
||||
@ -769,7 +875,7 @@ class Parser {
|
||||
escape_buf[escape_buf_pos++] = (char)ch;
|
||||
return;
|
||||
}
|
||||
if (is_hex_char(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; }
|
||||
if (is_hex_digit(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; }
|
||||
if (is_whitespace(ch)) return; // a single whitespace character is absorbed into escape
|
||||
reconsume();
|
||||
states.pop();
|
||||
@ -994,7 +1100,7 @@ class Parser {
|
||||
else token_queue.add_delimiter(ch);
|
||||
break;
|
||||
case '<':
|
||||
if (is_top_level() && peek() == '-' && peek(1) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
|
||||
if (is_top_level() && peek() == '!' && peek(1) == '-' && peek(2) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
|
||||
else token_queue.add_delimiter(ch);
|
||||
break;
|
||||
case '@':
|
||||
|
Loading…
x
Reference in New Issue
Block a user