@@ -19,6 +19,7 @@
 #include "toolchain/base/shared_value_stores.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_info.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/source/source_buffer.h"
 
@@ -177,7 +178,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto GetIndentColumnNumber(LineIndex line) const -> int;
 
   auto GetByteOffset(TokenIndex token) const -> int32_t {
-    return GetTokenInfo(token).byte_offset();
+    return token_infos_.Get(token).byte_offset();
   }
 
   // Returns true if the token comes after the comment.
@@ -299,167 +300,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     int indent;
   };
 
-  // Storage for the information about a specific token in the buffer.
-  //
-  // This provides a friendly accessor API to the carefully space-optimized
-  // storage model of the information we associated with each token.
-  //
-  // There are four pieces of information stored here:
-  // - The kind of the token.
-  // - Whether that token has leading whitespace before it.
-  // - A kind-specific payload that can be compressed into a small integer.
-  //   - This class provides dedicated accessors for each different form of
-  //     payload that check the kind and payload correspond correctly.
-  // - A 32-bit byte offset of the token within the source text.
-  //
-  // These are compressed and stored in 8-bytes for each token.
-  //
-  // Note that while the class provides some limited setters for payloads and
-  // mutating methods, setters on this type may be unexpectedly expensive due to
-  // the bit-packed representation and should be avoided. As such, only the
-  // minimal necessary setters are provided.
-  //
-  // TODO: It might be worth considering a struct-of-arrays data layout in order
-  // to move the byte offset to a separate array from the rest as it is only hot
-  // during lexing, and then cold during parsing and semantic analysis. However,
-  // a trivial approach to that adds more overhead than it saves due to tracking
-  // two separate vectors and their growth. Making this profitable would likely
-  // at least require a highly specialized single vector that manages the growth
-  // once and then provides separate storage areas for the two arrays.
-  class TokenInfo {
-   public:
-    // The kind for this token.
-    auto kind() const -> TokenKind { return kind_; }
-
-    // Whether this token is preceded by whitespace. We only store the preceding
-    // state, and look at the next token to check for trailing whitespace.
-    auto has_leading_space() const -> bool { return has_leading_space_; }
-
-    // A collection of methods to access the specific payload included with
-    // particular kinds of tokens. Only the specific payload accessor below may
-    // be used for an info entry of a token with a particular kind, and these
-    // check that the kind is valid. Some tokens do not include a payload at all
-    // and none of these methods may be called.
-    auto ident_id() const -> IdentifierId {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      return IdentifierId(token_payload_);
-    }
-    auto set_ident_id(IdentifierId ident_id) -> void {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      token_payload_ = ident_id.index;
-    }
-
-    auto string_literal_id() const -> StringLiteralValueId {
-      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
-      return StringLiteralValueId(token_payload_);
-    }
-
-    auto int_id() const -> IntId {
-      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
-                    kind() == TokenKind::IntTypeLiteral ||
-                    kind() == TokenKind::UnsignedIntTypeLiteral ||
-                    kind() == TokenKind::FloatTypeLiteral);
-      return IntId::MakeFromTokenPayload(token_payload_);
-    }
-
-    auto real_id() const -> RealId {
-      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
-      return RealId(token_payload_);
-    }
-
-    auto closing_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_closing_token_index(TokenIndex closing_index) -> void {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      token_payload_ = closing_index.index;
-    }
-
-    auto opening_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_opening_token_index(TokenIndex opening_index) -> void {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      token_payload_ = opening_index.index;
-    }
-
-    auto error_length() const -> int {
-      CARBON_DCHECK(kind() == TokenKind::Error);
-      return token_payload_;
-    }
-
-    // Zero-based byte offset of the token within the file. This can be combined
-    // with the buffer's line information to locate the line and column of the
-    // token as well.
-    auto byte_offset() const -> int32_t { return byte_offset_; }
-
-    // Transforms the token into an error token of the given length but at its
-    // original position and with the same whitespace adjacency.
-    auto ResetAsError(int error_length) -> void {
-      // Construct a fresh token to establish any needed invariants and replace
-      // this token with it.
-      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
-                      byte_offset());
-      *this = error;
-    }
-
-   private:
-    friend class Lexer;
-
-    static constexpr int PayloadBits = 23;
-
-    // Make sure we have enough payload bits to represent token-associated IDs.
-    static_assert(PayloadBits >= IntId::TokenIdBits);
-    static_assert(PayloadBits >= TokenIndex::Bits);
-
-    // Constructor for a TokenKind that carries no payload, or where the payload
-    // will be set later.
-    //
-    // Only used by the lexer which enforces only the correct kinds are used.
-    //
-    // When the payload is not being set, we leave it uninitialized. At least in
-    // some cases, this will allow MSan to correctly detect erroneous attempts
-    // to access the payload, as it works to track uninitialized memory
-    // bit-for-bit specifically to handle complex cases like bitfields.
-    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          byte_offset_(byte_offset) {}
-
-    // Constructor for a TokenKind that carries a payload.
-    //
-    // Only used by the lexer which enforces the correct kind and payload types.
-    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
-              int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          token_payload_(payload),
-          byte_offset_(byte_offset) {}
-
-    // A bitfield that encodes the token's kind, the leading space flag, and the
-    // remaining bits in a payload. These are encoded together as a bitfield for
-    // density and because these are the hottest fields of tokens for consumers
-    // after lexing.
-    //
-    // Payload values are typically ID types for which we create at most one per
-    // token, so we ensure that `token_payload_` is large enough to fit any
-    // token index. Stores to this field may overflow, but we produce an error
-    // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
-    // so this value never overflows if lexing succeeds.
-    TokenKind kind_;
-    static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
-    bool has_leading_space_ : 1;
-    unsigned token_payload_ : PayloadBits;
-
-    // Separate storage for the byte offset, this is hot while lexing but then
-    // generally cold.
-    int32_t byte_offset_;
-  };
-  static_assert(sizeof(TokenInfo) == 8,
-                "Expected `TokenInfo` to pack to an 8-byte structure.");
-
   // The constructor is merely responsible for trivial initialization of
   // members. A working object of this type is built with `Lex::Lex` so that its
   // return can indicate if an error was encountered while lexing.
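
Note: the `TokenInfo` class removed by this hunk is not deleted outright; it moves into the new `toolchain/lex/token_info.h` header included in the first hunk. As a sanity check on the packing scheme its comments describe, here is a minimal standalone sketch of the same layout. All `*Demo` names are hypothetical, and the 8-byte result assumes an Itanium-ABI compiler such as Clang or GCC; only the real class in `token_info.h` is authoritative.

#include <cstdint>

// Hypothetical stand-in showing only the field layout described above.
struct PackedTokenInfoDemo {
  static constexpr int PayloadBits = 23;

  uint8_t kind;                    // Stands in for the 8-bit `TokenKind`.
  bool has_leading_space : 1;      // One bit of whitespace adjacency.
  unsigned payload : PayloadBits;  // 23 bits; 8 + 1 + 23 == 32 in total.
  int32_t byte_offset;             // Fills the second 32-bit word.
};

// Mirrors the real class's assertion: the record packs into 8 bytes, so a
// million-token file spends only 8 MiB on token records.
static_assert(sizeof(PackedTokenInfoDemo) == 8,
              "expected the demo layout to pack into 8 bytes");
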
@@ -469,9 +309,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
       : value_stores_(&value_stores), source_(&source) {}
 
   auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
-  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
-  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
+
+  // Adds the token and adjusts the expected parse tree size.
   auto AddToken(TokenInfo info) -> TokenIndex;
+
   auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                   PrintWidths widths) const -> void;
@@ -486,7 +327,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   SharedValueStores* value_stores_;
   SourceBuffer* source_;
 
-  llvm::SmallVector<TokenInfo> token_infos_;
+  ValueStore<TokenIndex> token_infos_;
 
   ValueStore<LineIndex> line_infos_;
 
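
Note: the hunk above is the heart of the change. The raw `llvm::SmallVector<TokenInfo>` is replaced by a `ValueStore` keyed on `TokenIndex`, so token records can only be reached through the strongly typed `Get`/`Add` API rather than by indexing with a bare integer. A minimal sketch of that pattern follows, assuming the id type advertises the element type it indexes; this is an illustration with hypothetical `*Demo` names, not Carbon's actual `ValueStore`.

#include <utility>
#include <vector>

struct TokenInfoDemo { /* packed token fields elided */ };

// Hypothetical id type: it names its element type so the store below can be
// declared as `SimpleValueStore<TokenIndexDemo>`, mirroring how this patch
// declares `ValueStore<TokenIndex> token_infos_;`.
struct TokenIndexDemo {
  using ValueType = TokenInfoDemo;
  explicit TokenIndexDemo(int index) : index(index) {}
  int index;
};

template <typename IdT>
class SimpleValueStore {
 public:
  using ValueType = typename IdT::ValueType;

  // Appends a value and returns the strongly typed id naming it.
  auto Add(ValueType value) -> IdT {
    IdT id(static_cast<int>(values_.size()));
    values_.push_back(std::move(value));
    return id;
  }

  // Only the matching id type can reach an element; passing a bare `int` or
  // a different id type is a compile error, not a silent mix-up.
  auto Get(IdT id) const -> const ValueType& { return values_[id.index]; }

 private:
  std::vector<ValueType> values_;
};
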
@@ -514,35 +355,24 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 };
 
 inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
-  return GetTokenInfo(token).kind();
+  return token_infos_.Get(token).kind();
 }
 
 inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
     -> bool {
-  return GetTokenInfo(token).has_leading_space();
+  return token_infos_.Get(token).has_leading_space();
 }
 
 inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
     -> bool {
   TokenIterator it(token);
   ++it;
-  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
-  return token_infos_[token.index];
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
-    -> const TokenInfo& {
-  return token_infos_[token.index];
+  return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
 }
 
 inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
-  TokenIndex index(token_infos_.size());
-  token_infos_.push_back(info);
   expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
-  return index;
+  return token_infos_.Add(info);
 }
 
 }  // namespace Carbon::Lex
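
Note: as the final hunk shows, trailing whitespace is never stored. `HasTrailingWhitespace` advances an iterator and reads the next token's leading-space bit, matching the removed `TokenInfo` comment that only the preceding state is recorded. Below is a minimal sketch of that derivation over a plain vector, with hypothetical `*Demo` names; the buffer implements the same check via `token_infos_.Get(*it)`.

#include <vector>

// One stored bit per token serves both whitespace queries.
struct InfoDemo {
  bool has_leading_space;
};

// Token `i` has trailing whitespace exactly when token `i + 1` exists and
// recorded leading whitespace; the last token always reports false.
auto HasTrailingWhitespaceDemo(const std::vector<InfoDemo>& infos, int i)
    -> bool {
  return i + 1 < static_cast<int>(infos.size()) &&
         infos[i + 1].has_leading_space;
}

For the three tokens of `a+ b` the stored bits are {false, false, true} (ignoring any file start/end markers the real buffer may add), so only the `+` token reports trailing whitespace.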