Implement serialization of tokens

This commit is contained in:
Kovid Goyal 2021-03-19 15:19:16 +05:30
parent 94756715ef
commit f1e4095225
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -32,7 +32,7 @@ is_surrogate(char32_t ch) {
} }
static inline bool static inline bool
is_hex_char(char32_t ch) { is_hex_digit(char32_t ch) {
return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch || ch <= 'F'); return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch || ch <= 'F');
} }
@ -41,14 +41,19 @@ is_letter(char32_t ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'); return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
} }
static inline bool
is_digit(char32_t ch) {
return '0' <= ch && ch <= '9';
}
static inline bool static inline bool
is_name_start(char32_t ch) { is_name_start(char32_t ch) {
return is_letter(ch) || ch == '_' || ch >= 0x80; return is_letter(ch) || ch == '_' || ch >= 0x80;
} }
static inline bool static inline bool
is_digit(char32_t ch) { is_name_body(char32_t ch) {
return '0' <= ch && ch <= '9'; return is_name_start(ch) || is_digit(ch) || ch == '-';
} }
static inline bool static inline bool
@ -220,7 +225,6 @@ enum class TokenType : unsigned int {
whitespace, whitespace,
delimiter, delimiter,
ident, ident,
function,
at_keyword, at_keyword,
hash, hash,
string, string,
@ -228,7 +232,6 @@ enum class TokenType : unsigned int {
function_start, function_start,
number, number,
dimension, dimension,
percentage,
comment, comment,
cdo, cdo,
cdc cdc
@ -236,6 +239,8 @@ enum class TokenType : unsigned int {
class Token { class Token {
enum class NameSerializeState : unsigned { start, one_hyphen, body };
private: private:
TokenType type; TokenType type;
std::u32string text; std::u32string text;
@ -247,6 +252,55 @@ class Token {
unit_at = 0; unit_at = 0;
} }
void serialize_escaped_char(const char32_t ch, std::u32string &out) const {
out.push_back('\\');
if (is_whitespace(ch) || is_hex_digit(ch)) {
char buf[8];
int num = std::snprintf(buf, sizeof(buf), "%x ", ch);
if (num > 0) {
out.resize(out.size() + num);
for (int i = 0; i < num; i++) out[i + out.size() - num] = buf[i];
} else throw std::runtime_error("Failed to convert character to hexedecimal escape");
} else out.push_back(ch);
}
void serialize_ident(std::u32string &out) const {
NameSerializeState state = NameSerializeState::start;
for (const auto ch : text) {
switch(state) {
case NameSerializeState::start:
if (is_name_start(ch)) { out.push_back(ch); state = NameSerializeState::body; }
else if (ch == '-') { out.push_back(ch); state = NameSerializeState::one_hyphen; }
else throw std::logic_error("Unable to serialize ident because of invalid start character");
break;
case NameSerializeState::one_hyphen:
if (is_name_start(ch) || ch == '-') { out.push_back(ch); state = NameSerializeState::body; }
else serialize_escaped_char(ch, out);
break;
case NameSerializeState::body:
if (is_name_body(ch)) out.push_back(ch);
else serialize_escaped_char(ch, out);
break;
}
}
}
void serialize_hash(std::u32string &out) const {
for (const auto ch : text) {
if (is_name_body(ch)) out.push_back(ch);
else serialize_escaped_char(ch, out);
}
}
void serialize_string(std::u32string &out) const {
const char32_t delim = text.find('"') == std::u32string::npos ? '"' : '\'';
for (const auto ch : text) {
if (ch == '\n') out.append({'\\', '\n'});
else if (ch == delim || ch == '\\') serialize_escaped_char(ch, out);
else out.push_back(ch);
}
}
public: public:
Token() : Token() :
type(TokenType::whitespace), text(), unit_at(0), out_pos(0) { type(TokenType::whitespace), text(), unit_at(0), out_pos(0) {
@ -279,6 +333,7 @@ class Token {
TokenType get_type() const { return type; } TokenType get_type() const { return type; }
void set_type(const TokenType q) { type = q; } void set_type(const TokenType q) { type = q; }
size_t get_output_position() const { return out_pos; }
void set_output_position(const size_t val) { out_pos = val; } void set_output_position(const size_t val) { out_pos = val; }
bool is_type(const TokenType q) const { return type == q; } bool is_type(const TokenType q) const { return type == q; }
bool is_delimiter(const char32_t ch) const { return type == TokenType::delimiter && text.size() == 1 && text[0] == ch; } bool is_delimiter(const char32_t ch) const { return type == TokenType::delimiter && text.size() == 1 && text[0] == ch; }
@ -374,6 +429,53 @@ class Token {
set_text(scratch); set_text(scratch);
return true; return true;
} }
void serialize(std::u32string &out) const {
switch (type) {
case TokenType::whitespace:
case TokenType::delimiter:
out.append(text);
break;
case TokenType::ident:
serialize_ident(out);
break;
case TokenType::at_keyword:
out.push_back('@');
serialize_ident(out);
break;
case TokenType::hash:
out.push_back('#');
serialize_hash(out);
break;
case TokenType::string:
serialize_string(out);
break;
case TokenType::url:
out.append({'u', 'r', 'l', '('});
serialize_string(out);
out.push_back(')');
break;
case TokenType::function_start:
serialize_ident(out);
out.push_back('(');
break;
case TokenType::number:
case TokenType::dimension:
out.append(text);
break;
case TokenType::comment:
out.append({'/', '*'});
out.append(text);
out.append({'*', '/'});
break;
case TokenType::cdo:
out.append({'<', '!', '-', '-'});
break;
case TokenType::cdc:
out.append({'-', '-', '>'});
break;
}
}
}; };
class TokenQueue { class TokenQueue {
@ -384,6 +486,8 @@ class TokenQueue {
std::string scratch, scratch2; std::string scratch, scratch2;
pyobject_raii url_callback; pyobject_raii url_callback;
size_t current_output_position() const { return out.size(); }
void new_token(const TokenType type, const char32_t ch = 0) { void new_token(const TokenType type, const char32_t ch = 0) {
if (pool.empty()) queue.emplace_back(type, ch, current_output_position()); if (pool.empty()) queue.emplace_back(type, ch, current_output_position());
else { else {
@ -394,8 +498,6 @@ class TokenQueue {
} }
} }
size_t current_output_position() const { return out.size(); }
void add_char_of_type(const TokenType type, const char32_t ch) { void add_char_of_type(const TokenType type, const char32_t ch) {
if (!queue.empty() && queue.back().is_type(type)) queue.back().add_char(ch); if (!queue.empty() && queue.back().is_type(type)) queue.back().add_char(ch);
else new_token(type, ch); else new_token(type, ch);
@ -552,16 +654,16 @@ class TokenQueue {
void add_delimiter(const char32_t ch) { new_token(TokenType::delimiter, ch); } void add_delimiter(const char32_t ch) { new_token(TokenType::delimiter, ch); }
void add_hash() { new_token(TokenType::hash, '#'); } void add_hash() { new_token(TokenType::hash); }
void add_at_keyword() { new_token(TokenType::at_keyword, '@'); } void add_at_keyword() { new_token(TokenType::at_keyword); }
void add_number(const char32_t ch) { new_token(TokenType::number, ch); } void add_number(const char32_t ch) { new_token(TokenType::number, ch); }
void add_ident(const char32_t ch) { new_token(TokenType::ident, ch); } void add_ident(const char32_t ch) { new_token(TokenType::ident, ch); }
void add_cdc() { new_token(TokenType::cdc, '-'); queue.back().add_char('-'); queue.back().add_char('>'); } void add_cdc() { new_token(TokenType::cdc); }
void add_cdo() { new_token(TokenType::cdo, '<'); queue.back().add_char('-'); queue.back().add_char('-'); } void add_cdo() { new_token(TokenType::cdo); }
void mark_unit() { void mark_unit() {
if (queue.empty()) throw std::logic_error("Attempting to mark unit with no token present"); if (queue.empty()) throw std::logic_error("Attempting to mark unit with no token present");
@ -596,6 +698,10 @@ class TokenQueue {
} else { } else {
if (process_declaration()) changed = true; if (process_declaration()) changed = true;
} }
if (changed && queue.size()) {
out.resize(queue[0].get_output_position());
for (auto tok : queue) tok.serialize(out);
}
return_tokens_to_pool(); return_tokens_to_pool();
} }
}; };
@ -761,7 +867,7 @@ class Parser {
void handle_escape() { void handle_escape() {
if (!escape_buf_pos) { if (!escape_buf_pos) {
if (ch == '\n') { reconsume(); states.pop(); return; } if (ch == '\n') { reconsume(); states.pop(); return; }
if (!is_hex_char(ch)) { if (!is_hex_digit(ch)) {
states.pop(); states.pop();
token_queue.add_char(ch); token_queue.add_char(ch);
return; return;
@ -769,7 +875,7 @@ class Parser {
escape_buf[escape_buf_pos++] = (char)ch; escape_buf[escape_buf_pos++] = (char)ch;
return; return;
} }
if (is_hex_char(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; } if (is_hex_digit(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; }
if (is_whitespace(ch)) return; // a single whitespace character is absorbed into escape if (is_whitespace(ch)) return; // a single whitespace character is absorbed into escape
reconsume(); reconsume();
states.pop(); states.pop();
@ -994,7 +1100,7 @@ class Parser {
else token_queue.add_delimiter(ch); else token_queue.add_delimiter(ch);
break; break;
case '<': case '<':
if (is_top_level() && peek() == '-' && peek(1) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); } if (is_top_level() && peek() == '!' && peek(1) == '-' && peek(2) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
else token_queue.add_delimiter(ch); else token_queue.add_delimiter(ch);
break; break;
case '@': case '@':