1: <?php declare(strict_types=1);
2:
3: namespace PhpParser;
4:
5: require __DIR__ . '/compatibility_tokens.php';
6:
class Lexer {
    /**
     * Tokenize the provided source code.
     *
     * The token array is in the same format as provided by the PhpToken::tokenize() method in
     * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
     * implementation in earlier PHP versions.
     *
     * The token array is terminated by a sentinel token with token ID 0.
     * The token array does not discard any tokens (i.e. whitespace and comments are included).
     * The token position attributes are against this token array.
     *
     * While tokenizing, the "xdebug.scream" ini setting is temporarily set to '0' and restored
     * afterwards, including when the error handler throws.
     *
     * @param string $code The source code to tokenize.
     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
     *                                        ErrorHandler\Throwing.
     * @return Token[] Tokens
     */
    public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
        if (null === $errorHandler) {
            $errorHandler = new ErrorHandler\Throwing();
        }

        // Presumably disabled so that the @ suppression below is honored even when xdebug's
        // scream mode is active. ini_set() returns false if the setting could not be changed.
        $scream = ini_set('xdebug.scream', '0');

        try {
            $tokens = @Token::tokenize($code);
            $this->postprocessTokens($tokens, $errorHandler);
        } finally {
            // Restore the previous value even if the error handler threw, so that a lexing
            // error does not leave the ini setting modified for the rest of the process.
            if (false !== $scream) {
                ini_set('xdebug.scream', $scream);
            }
        }

        return $tokens;
    }

    /**
     * Report a T_BAD_CHARACTER token to the error handler.
     *
     * The error spans exactly the token's own line and file position.
     *
     * @param Token $token The offending token; its text is used in the error message.
     * @param ErrorHandler $errorHandler Handler receiving the error.
     */
    private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
        $chr = $token->text;
        if ($chr === "\0") {
            // PHP cuts error message after null byte, so need special case
            $errorMsg = 'Unexpected null byte';
        } else {
            $errorMsg = sprintf(
                'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
            );
        }

        $errorHandler->handleError(new Error($errorMsg, [
            'startLine' => $token->line,
            'endLine' => $token->line,
            'startFilePos' => $token->pos,
            'endFilePos' => $token->pos,
        ]));
    }

    /**
     * Check whether a token is a block comment that is missing its closing marker.
     *
     * @param Token $token Token to inspect.
     * @return bool True if the token is a T_COMMENT/T_DOC_COMMENT that starts with the block
     *              comment opener but is not properly closed.
     */
    private function isUnterminatedComment(Token $token): bool {
        if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])) {
            return false;
        }

        $text = $token->text;
        if (substr($text, 0, 2) !== '/*') {
            // Only comments using the block comment opener need a terminator.
            return false;
        }

        // The shortest closed block comment is four bytes long (opener directly followed by
        // the closer). Anything shorter cannot be closed, even if its last two bytes happen
        // to look like the closer because the "*" is shared with the opener.
        return \strlen($text) < 4 || substr($text, -2) !== '*/';
    }

    /**
     * Report lexing errors and canonicalize the token array in place.
     *
     * @param list<Token> $tokens Token array, modified by reference: ampersand tokens get
     *                            their ID rewritten and a sentinel token is appended.
     * @param ErrorHandler $errorHandler Handler receiving bad character and unterminated
     *                                   comment errors.
     */
    protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
        // This function reports errors (bad characters and unterminated comments) in the token
        // array, and performs certain canonicalizations:
        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
        //  * Add a sentinel token with ID 0.

        $numTokens = \count($tokens);
        if ($numTokens === 0) {
            // Empty input edge case: Just add the sentinel token.
            $tokens[] = new Token(0, "\0", 1, 0);
            return;
        }

        for ($i = 0; $i < $numTokens; $i++) {
            $token = $tokens[$i];
            if ($token->id === \T_BAD_CHARACTER) {
                $this->handleInvalidCharacter($token, $errorHandler);
            }

            if ($token->id === \ord('&')) {
                // Look past any whitespace tokens to find what follows the ampersand.
                $next = $i + 1;
                while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
                    $next++;
                }
                $followedByVarOrVarArg = isset($tokens[$next]) &&
                    $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
                $token->id = $followedByVarOrVarArg
                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
            }
        }

        // Check for unterminated comment. Only the last token is inspected.
        $lastToken = $tokens[$numTokens - 1];
        if ($this->isUnterminatedComment($lastToken)) {
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $lastToken->line,
                'endLine' => $lastToken->getEndLine(),
                'startFilePos' => $lastToken->pos,
                'endFilePos' => $lastToken->getEndPos(),
            ]));
        }

        // Add sentinel token, positioned at the end of the last real token.
        $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
    }
}
117: