1: <?php declare(strict_types=1);
2:
3: namespace PhpParser;
4:
5: use PhpParser\Parser\Tokens;
6:
class Lexer
{
    /** @var string Source code being lexed (kept around for __halt_compiler() handling) */
    protected $code;
    /** @var array Tokens in token_get_all() format, after postprocessTokens() has run */
    protected $tokens;
    /** @var int Index into $tokens of the most recently fetched token (-1 before the first fetch) */
    protected $pos;
    /** @var int Line number at the current position (starts at 1) */
    protected $line;
    /** @var int Byte offset into $code at the current position (starts at 0) */
    protected $filePos;
    /** @var bool Whether the most recently seen close tag contained "\n" or "\r" */
    protected $prevCloseTagHasNewline;

    /** @var array Map from PHP token IDs to the token IDs used by the parser */
    protected $tokenMap;
    /** @var array Set (token ID => 1) of tokens that getNextToken() never returns */
    protected $dropTokens;
    /** @var array Set (token ID => true) of tokens that may be merged into a T_NAME_* token */
    protected $identifierTokens;

    /** @var bool Whether the 'startLine' attribute is generated */
    private $attributeStartLineUsed;
    /** @var bool Whether the 'endLine' attribute is generated */
    private $attributeEndLineUsed;
    /** @var bool Whether the 'startTokenPos' attribute is generated */
    private $attributeStartTokenPosUsed;
    /** @var bool Whether the 'endTokenPos' attribute is generated */
    private $attributeEndTokenPosUsed;
    /** @var bool Whether the 'startFilePos' attribute is generated */
    private $attributeStartFilePosUsed;
    /** @var bool Whether the 'endFilePos' attribute is generated */
    private $attributeEndFilePosUsed;
    /** @var bool Whether the 'comments' attribute is generated */
    private $attributeCommentsUsed;

    /**
     * Creates a Lexer.
     *
     * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
     *                       which is an array of attributes to add to the AST nodes. Possible
     *                       attributes are: 'comments', 'startLine', 'endLine', 'startTokenPos',
     *                       'endTokenPos', 'startFilePos', 'endFilePos'. The option defaults to the
     *                       first three. For more info see getNextToken() docs.
     */
    public function __construct(array $options = []) {
        // Create Map from internal tokens to PhpParser tokens.
        $this->defineCompatibilityTokens();
        $this->tokenMap = $this->createTokenMap();
        $this->identifierTokens = $this->createIdentifierTokenMap();

        // map of tokens to drop while lexing (the map is only used for isset lookup,
        // that's why the value is simply set to 1; the value is never actually used.)
        $this->dropTokens = array_fill_keys(
            [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, \T_BAD_CHARACTER], 1
        );

        $defaultAttributes = ['comments', 'startLine', 'endLine'];
        $usedAttributes = array_fill_keys($options['usedAttributes'] ?? $defaultAttributes, true);

        // Create individual boolean properties to make these checks faster.
        $this->attributeStartLineUsed = isset($usedAttributes['startLine']);
        $this->attributeEndLineUsed = isset($usedAttributes['endLine']);
        $this->attributeStartTokenPosUsed = isset($usedAttributes['startTokenPos']);
        $this->attributeEndTokenPosUsed = isset($usedAttributes['endTokenPos']);
        $this->attributeStartFilePosUsed = isset($usedAttributes['startFilePos']);
        $this->attributeEndFilePosUsed = isset($usedAttributes['endFilePos']);
        $this->attributeCommentsUsed = isset($usedAttributes['comments']);
    }

    /**
     * Initializes the lexer for lexing the provided source code.
     *
     * The code is tokenized eagerly. Every lexing error that is detected is passed to the
     * error handler's handleError() method; what happens next (throwing, collecting, ...) is
     * up to that handler.
     *
     * @param string            $code         The source code to lex
     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
     *                                        ErrorHandler\Throwing
     */
    public function startLexing(string $code, ?ErrorHandler $errorHandler = null) {
        if (null === $errorHandler) {
            $errorHandler = new ErrorHandler\Throwing();
        }

        $this->code = $code; // keep the code around for __halt_compiler() handling
        $this->pos  = -1;
        $this->line =  1;
        $this->filePos = 0;

        // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
        // This ensures proper composability, because having a newline is the "safe" assumption.
        $this->prevCloseTagHasNewline = true;

        // Temporarily disable xdebug.scream so that the @ suppression below stays effective.
        $scream = ini_set('xdebug.scream', '0');

        $this->tokens = @token_get_all($code);
        $this->postprocessTokens($errorHandler);

        // ini_set() returns false if the setting could not be changed; nothing to restore then.
        if (false !== $scream) {
            ini_set('xdebug.scream', $scream);
        }
    }

    /**
     * Reports every byte in the half-open range [$start, $end) of the code as an invalid
     * character and builds a matching T_BAD_CHARACTER token for each of them.
     *
     * @param int|false    $start        Offset of the first invalid byte
     * @param int|false    $end          Offset one past the last invalid byte
     * @param int          $line         Line number to report for all of the bytes
     * @param ErrorHandler $errorHandler Handler that receives one Error per byte
     *
     * @return array T_BAD_CHARACTER tokens in token_get_all() format, one per byte
     */
    private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
        $tokens = [];
        for ($i = $start; $i < $end; $i++) {
            $chr = $this->code[$i];
            if ($chr === "\0") {
                // PHP cuts error message after null byte, so need special case
                $errorMsg = 'Unexpected null byte';
            } else {
                $errorMsg = sprintf(
                    'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
                );
            }

            $tokens[] = [\T_BAD_CHARACTER, $chr, $line];
            $errorHandler->handleError(new Error($errorMsg, [
                'startLine' => $line,
                'endLine' => $line,
                'startFilePos' => $i,
                'endFilePos' => $i,
            ]));
        }
        return $tokens;
    }

    /**
     * Check whether comment token is unterminated.
     *
     * Only block comments (starting with "/*") can be unterminated. A terminated block comment
     * is at least four bytes long; shorter texts such as "/*" followed directly by "/" merely
     * look terminated because the opening and closing markers would share the same "*".
     *
     * @param array|string $token Token in token_get_all() format
     *
     * @return bool
     */
    private function isUnterminatedComment($token) : bool {
        if ($token[0] !== \T_COMMENT && $token[0] !== \T_DOC_COMMENT) {
            return false;
        }

        $text = $token[1];
        if (substr($text, 0, 2) !== '/*') {
            // Line comments (// and #) end implicitly.
            return false;
        }

        return \strlen($text) < 4 || substr($text, -2) !== '*/';
    }

    /**
     * Validates and canonicalizes the raw token array produced by token_get_all().
     *
     * Walks over $this->tokens once, keeping track of the file position and line, reports
     * invalid characters and unterminated comments to the error handler and rewrites the
     * token array in place (see the comment at the start of the method body).
     *
     * @param ErrorHandler $errorHandler Handler that receives the detected lexing errors
     */
    protected function postprocessTokens(ErrorHandler $errorHandler) {
        // PHP's error handling for token_get_all() is rather bad, so if we want detailed
        // error information we need to compute it ourselves. Invalid character errors are
        // detected by finding "gaps" in the token array. Unterminated comments are detected
        // by checking if a trailing comment has a "*/" at the end.
        //
        // Additionally, we perform a number of canonicalizations here:
        //  * Use the PHP 8.0 comment format, which does not include trailing whitespace anymore.
        //  * Use PHP 8.0 T_NAME_* tokens.
        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.

        $filePos = 0;
        $line = 1;
        $numTokens = \count($this->tokens);
        for ($i = 0; $i < $numTokens; $i++) {
            $token = $this->tokens[$i];

            // Since PHP 7.4 invalid characters are represented by a T_BAD_CHARACTER token.
            // In this case we only need to emit an error.
            if ($token[0] === \T_BAD_CHARACTER) {
                $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
            }

            if ($token[0] === \T_COMMENT && substr($token[1], 0, 2) !== '/*'
                    && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
                $trailingNewline = $matches[0];
                $token[1] = substr($token[1], 0, -strlen($trailingNewline));
                $this->tokens[$i] = $token;
                if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
                    // Move trailing newline into following T_WHITESPACE token, if it already exists.
                    $this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
                    // The whitespace token now starts on the line of the comment.
                    $this->tokens[$i + 1][2]--;
                } else {
                    // Otherwise, we need to create a new T_WHITESPACE token.
                    array_splice($this->tokens, $i + 1, 0, [
                        [\T_WHITESPACE, $trailingNewline, $line],
                    ]);
                    $numTokens++;
                }
            }

            // Emulate PHP 8 T_NAME_* tokens, by combining sequences of T_NS_SEPARATOR and T_STRING
            // into a single token.
            if (\is_array($token)
                    && ($token[0] === \T_NS_SEPARATOR || isset($this->identifierTokens[$token[0]]))) {
                $lastWasSeparator = $token[0] === \T_NS_SEPARATOR;
                $text = $token[1];
                for ($j = $i + 1; isset($this->tokens[$j]); $j++) {
                    if ($lastWasSeparator) {
                        if (!isset($this->identifierTokens[$this->tokens[$j][0]])) {
                            break;
                        }
                        $lastWasSeparator = false;
                    } else {
                        if ($this->tokens[$j][0] !== \T_NS_SEPARATOR) {
                            break;
                        }
                        $lastWasSeparator = true;
                    }
                    $text .= $this->tokens[$j][1];
                }
                if ($lastWasSeparator) {
                    // Trailing separator is not part of the name.
                    $j--;
                    $text = substr($text, 0, -1);
                }
                // Only rewrite if more than one token is part of the name.
                if ($j > $i + 1) {
                    if ($token[0] === \T_NS_SEPARATOR) {
                        $type = \T_NAME_FULLY_QUALIFIED;
                    } elseif ($token[0] === \T_NAMESPACE) {
                        $type = \T_NAME_RELATIVE;
                    } else {
                        $type = \T_NAME_QUALIFIED;
                    }
                    $token = [$type, $text, $line];
                    array_splice($this->tokens, $i, $j - $i, [$token]);
                    $numTokens -= $j - $i - 1;
                }
            }

            if ($token === '&') {
                // Look past whitespace to decide which of the two ampersand tokens to use.
                $next = $i + 1;
                while (isset($this->tokens[$next]) && $this->tokens[$next][0] === \T_WHITESPACE) {
                    $next++;
                }
                $followedByVarOrVarArg = isset($this->tokens[$next]) &&
                    ($this->tokens[$next][0] === \T_VARIABLE || $this->tokens[$next][0] === \T_ELLIPSIS);
                $this->tokens[$i] = $token = [
                    $followedByVarOrVarArg
                        ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                        : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG,
                    '&',
                    $line,
                ];
            }

            $tokenValue = \is_string($token) ? $token : $token[1];
            $tokenLen = \strlen($tokenValue);

            if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
                // Something is missing, must be an invalid character
                $nextFilePos = strpos($this->code, $tokenValue, $filePos);
                $badCharTokens = $this->handleInvalidCharacterRange(
                    $filePos, $nextFilePos, $line, $errorHandler);
                $filePos = (int) $nextFilePos;

                // Insert the bad character tokens before the current token and skip over them.
                array_splice($this->tokens, $i, 0, $badCharTokens);
                $numTokens += \count($badCharTokens);
                $i += \count($badCharTokens);
            }

            $filePos += $tokenLen;
            $line += substr_count($tokenValue, "\n");
        }

        if ($filePos !== \strlen($this->code)) {
            if (substr($this->code, $filePos, 2) === '/*') {
                // Unlike PHP, HHVM will drop unterminated comments entirely
                $comment = substr($this->code, $filePos);
                $errorHandler->handleError(new Error('Unterminated comment', [
                    'startLine' => $line,
                    'endLine' => $line + substr_count($comment, "\n"),
                    'startFilePos' => $filePos,
                    'endFilePos' => $filePos + \strlen($comment),
                ]));

                // Emulate the PHP behavior: a doc comment is "/**" followed by whitespace.
                $isDocComment = isset($comment[3])
                    && $comment[2] === '*'
                    && false !== strpos(" \t\r\n", $comment[3]);
                $this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
            } else {
                // Invalid characters at the end of the input
                $badCharTokens = $this->handleInvalidCharacterRange(
                    $filePos, \strlen($this->code), $line, $errorHandler);
                $this->tokens = array_merge($this->tokens, $badCharTokens);
            }
            return;
        }

        if (\count($this->tokens) > 0) {
            // Check for unterminated comment
            $lastToken = $this->tokens[\count($this->tokens) - 1];
            if ($this->isUnterminatedComment($lastToken)) {
                $errorHandler->handleError(new Error('Unterminated comment', [
                    'startLine' => $line - substr_count($lastToken[1], "\n"),
                    'endLine' => $line,
                    'startFilePos' => $filePos - \strlen($lastToken[1]),
                    'endFilePos' => $filePos,
                ]));
            }
        }
    }

    /**
     * Fetches the next token.
     *
     * The available attributes are determined by the 'usedAttributes' option, which can
     * be specified in the constructor. The following attributes are supported:
     *
     *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
     *                       representing all comments that occurred between the previous
     *                       non-discarded token and the current one.
     *  * 'startLine'     => Line in which the node starts.
     *  * 'endLine'       => Line in which the node ends.
     *  * 'startTokenPos' => Offset into the token array of the first token in the node.
     *  * 'endTokenPos'   => Offset into the token array of the last token in the node.
     *  * 'startFilePos'  => Offset into the code string of the first character that is part of the node.
     *  * 'endFilePos'    => Offset into the code string of the last character that is part of the node.
     *
     * For T_INLINE_HTML tokens the start attributes additionally contain 'hasLeadingNewline'.
     *
     * @param mixed $value           Variable to store token content in
     * @param mixed $startAttributes Variable to store start attributes in
     * @param mixed $endAttributes   Variable to store end attributes in
     *
     * @return int Token id
     */
    public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) : int {
        $startAttributes = [];
        $endAttributes   = [];

        while (1) {
            if (isset($this->tokens[++$this->pos])) {
                $token = $this->tokens[$this->pos];
            } else {
                // EOF token with ID 0
                $token = "\0";
            }

            // These are overwritten on every iteration, so they end up describing the token
            // that is actually returned, while 'comments' accumulate across dropped tokens.
            if ($this->attributeStartLineUsed) {
                $startAttributes['startLine'] = $this->line;
            }
            if ($this->attributeStartTokenPosUsed) {
                $startAttributes['startTokenPos'] = $this->pos;
            }
            if ($this->attributeStartFilePosUsed) {
                $startAttributes['startFilePos'] = $this->filePos;
            }

            if (\is_string($token)) {
                $value = $token;
                if (isset($token[1])) {
                    // bug in token_get_all
                    $this->filePos += 2;
                    $id = ord('"');
                } else {
                    $this->filePos += 1;
                    $id = ord($token);
                }
            } elseif (!isset($this->dropTokens[$token[0]])) {
                $value = $token[1];
                $id = $this->tokenMap[$token[0]];
                if (\T_CLOSE_TAG === $token[0]) {
                    $this->prevCloseTagHasNewline = false !== strpos($token[1], "\n")
                        || false !== strpos($token[1], "\r");
                } elseif (\T_INLINE_HTML === $token[0]) {
                    $startAttributes['hasLeadingNewline'] = $this->prevCloseTagHasNewline;
                }

                $this->line += substr_count($value, "\n");
                $this->filePos += \strlen($value);
            } else {
                // Dropped token: advance the position and collect it if it is a comment.
                $origLine = $this->line;
                $origFilePos = $this->filePos;
                $this->line += substr_count($token[1], "\n");
                $this->filePos += \strlen($token[1]);

                if (\T_COMMENT === $token[0] || \T_DOC_COMMENT === $token[0]) {
                    if ($this->attributeCommentsUsed) {
                        $comment = \T_DOC_COMMENT === $token[0]
                            ? new Comment\Doc($token[1],
                                $origLine, $origFilePos, $this->pos,
                                $this->line, $this->filePos - 1, $this->pos)
                            : new Comment($token[1],
                                $origLine, $origFilePos, $this->pos,
                                $this->line, $this->filePos - 1, $this->pos);
                        $startAttributes['comments'][] = $comment;
                    }
                }
                continue;
            }

            if ($this->attributeEndLineUsed) {
                $endAttributes['endLine'] = $this->line;
            }
            if ($this->attributeEndTokenPosUsed) {
                $endAttributes['endTokenPos'] = $this->pos;
            }
            if ($this->attributeEndFilePosUsed) {
                $endAttributes['endFilePos'] = $this->filePos - 1;
            }

            return $id;
        }

        // Not reachable through the loop above; kept as a safeguard.
        throw new \RuntimeException('Reached end of lexer loop');
    }

    /**
     * Returns the token array for current code.
     *
     * The token array is in the same format as provided by the
     * token_get_all() function and does not discard tokens (i.e.
     * whitespace and comments are included). The token position
     * attributes are against this token array.
     *
     * @return array Array of tokens in token_get_all() format
     */
    public function getTokens() : array {
        return $this->tokens;
    }

    /**
     * Handles __halt_compiler() by returning the text after it.
     *
     * After this call getNextToken() only returns the EOF token.
     *
     * @return string Remaining text
     *
     * @throws Error If __halt_compiler is not directly followed by "()" and ";" or a close tag
     */
    public function handleHaltCompiler() : string {
        // text after T_HALT_COMPILER, still including ();
        $textAfter = substr($this->code, $this->filePos);

        // ensure that it is followed by ();
        // this simplifies the situation, by not allowing any comments
        // in between of the tokens.
        if (!preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $textAfter, $matches)) {
            throw new Error('__HALT_COMPILER must be followed by "();"');
        }

        // prevent the lexer from returning any further tokens
        $this->pos = \count($this->tokens);

        // return with (); removed
        return substr($textAfter, \strlen($matches[0]));
    }

    /**
     * Defines T_* constants for tokens that the running PHP version does not know yet.
     *
     * The work is done only once per process, no matter how many lexers are created.
     *
     * @throws \Error If two of the already defined compatibility tokens share the same ID
     */
    private function defineCompatibilityTokens() {
        static $compatTokensDefined = false;
        if ($compatTokensDefined) {
            return;
        }

        $compatTokens = [
            // PHP 7.4
            'T_BAD_CHARACTER',
            'T_FN',
            'T_COALESCE_EQUAL',
            // PHP 8.0
            'T_NAME_QUALIFIED',
            'T_NAME_FULLY_QUALIFIED',
            'T_NAME_RELATIVE',
            'T_MATCH',
            'T_NULLSAFE_OBJECT_OPERATOR',
            'T_ATTRIBUTE',
            // PHP 8.1
            'T_ENUM',
            'T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG',
            'T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG',
            'T_READONLY',
        ];

        // PHP-Parser might be used together with another library that also emulates some or all
        // of these tokens. Perform a sanity-check that all already defined tokens have been
        // assigned a unique ID.
        $usedTokenIds = [];
        foreach ($compatTokens as $token) {
            if (\defined($token)) {
                $tokenId = \constant($token);
                $clashingToken = $usedTokenIds[$tokenId] ?? null;
                if ($clashingToken !== null) {
                    throw new \Error(sprintf(
                        'Token %s has same ID as token %s, ' .
                        'you may be using a library with broken token emulation',
                        $token, $clashingToken
                    ));
                }
                $usedTokenIds[$tokenId] = $token;
            }
        }

        // Now define any tokens that have not yet been emulated. Try to assign IDs from -1
        // downwards, but skip any IDs that may already be in use.
        $newTokenId = -1;
        foreach ($compatTokens as $token) {
            if (!\defined($token)) {
                while (isset($usedTokenIds[$newTokenId])) {
                    $newTokenId--;
                }
                \define($token, $newTokenId);
                $newTokenId--;
            }
        }

        $compatTokensDefined = true;
    }

    /**
     * Creates the token map.
     *
     * The token map maps the PHP internal token identifiers
     * to the identifiers used by the Parser. Additionally it
     * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
     *
     * @return array The token map
     */
    protected function createTokenMap() : array {
        $tokenMap = [];

        // 256 is the minimum possible token number, as everything below
        // it is an ASCII value
        for ($i = 256; $i < 1000; ++$i) {
            if (\T_DOUBLE_COLON === $i) {
                // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
                $tokenMap[$i] = Tokens::T_PAAMAYIM_NEKUDOTAYIM;
            } elseif (\T_OPEN_TAG_WITH_ECHO === $i) {
                // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
                $tokenMap[$i] = Tokens::T_ECHO;
            } elseif (\T_CLOSE_TAG === $i) {
                // T_CLOSE_TAG is equivalent to ';'
                $tokenMap[$i] = ord(';');
            } elseif ('UNKNOWN' !== $name = token_name($i)) {
                if ('T_HASHBANG' === $name) {
                    // HHVM uses a special token for #! hashbang lines
                    $tokenMap[$i] = Tokens::T_INLINE_HTML;
                } elseif (defined($name = Tokens::class . '::' . $name)) {
                    // Other tokens can be mapped directly
                    $tokenMap[$i] = constant($name);
                }
            }
        }

        // HHVM uses a special token for numbers that overflow to double
        if (defined('T_ONUMBER')) {
            $tokenMap[\T_ONUMBER] = Tokens::T_DNUMBER;
        }
        // HHVM also has a separate token for the __COMPILER_HALT_OFFSET__ constant
        if (defined('T_COMPILER_HALT_OFFSET')) {
            $tokenMap[\T_COMPILER_HALT_OFFSET] = Tokens::T_STRING;
        }

        // Assign tokens for which we define compatibility constants, as token_name() does not know them.
        $tokenMap[\T_FN] = Tokens::T_FN;
        $tokenMap[\T_COALESCE_EQUAL] = Tokens::T_COALESCE_EQUAL;
        $tokenMap[\T_NAME_QUALIFIED] = Tokens::T_NAME_QUALIFIED;
        $tokenMap[\T_NAME_FULLY_QUALIFIED] = Tokens::T_NAME_FULLY_QUALIFIED;
        $tokenMap[\T_NAME_RELATIVE] = Tokens::T_NAME_RELATIVE;
        $tokenMap[\T_MATCH] = Tokens::T_MATCH;
        $tokenMap[\T_NULLSAFE_OBJECT_OPERATOR] = Tokens::T_NULLSAFE_OBJECT_OPERATOR;
        $tokenMap[\T_ATTRIBUTE] = Tokens::T_ATTRIBUTE;
        $tokenMap[\T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG] = Tokens::T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
        $tokenMap[\T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG] = Tokens::T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG;
        $tokenMap[\T_ENUM] = Tokens::T_ENUM;
        $tokenMap[\T_READONLY] = Tokens::T_READONLY;

        return $tokenMap;
    }

    /**
     * Creates the set of tokens that postprocessTokens() treats as name parts when merging
     * namespaced names into T_NAME_* tokens.
     *
     * @return array Map from token ID to true
     */
    private function createIdentifierTokenMap(): array {
        // Based on semi_reserved production.
        return array_fill_keys([
            \T_STRING,
            \T_STATIC, \T_ABSTRACT, \T_FINAL, \T_PRIVATE, \T_PROTECTED, \T_PUBLIC, \T_READONLY,
            \T_INCLUDE, \T_INCLUDE_ONCE, \T_EVAL, \T_REQUIRE, \T_REQUIRE_ONCE, \T_LOGICAL_OR, \T_LOGICAL_XOR, \T_LOGICAL_AND,
            \T_INSTANCEOF, \T_NEW, \T_CLONE, \T_EXIT, \T_IF, \T_ELSEIF, \T_ELSE, \T_ENDIF, \T_ECHO, \T_DO, \T_WHILE,
            \T_ENDWHILE, \T_FOR, \T_ENDFOR, \T_FOREACH, \T_ENDFOREACH, \T_DECLARE, \T_ENDDECLARE, \T_AS, \T_TRY, \T_CATCH,
            \T_FINALLY, \T_THROW, \T_USE, \T_INSTEADOF, \T_GLOBAL, \T_VAR, \T_UNSET, \T_ISSET, \T_EMPTY, \T_CONTINUE, \T_GOTO,
            \T_FUNCTION, \T_CONST, \T_RETURN, \T_PRINT, \T_YIELD, \T_LIST, \T_SWITCH, \T_ENDSWITCH, \T_CASE, \T_DEFAULT,
            \T_BREAK, \T_ARRAY, \T_CALLABLE, \T_EXTENDS, \T_IMPLEMENTS, \T_NAMESPACE, \T_TRAIT, \T_INTERFACE, \T_CLASS,
            \T_CLASS_C, \T_TRAIT_C, \T_FUNC_C, \T_METHOD_C, \T_LINE, \T_FILE, \T_DIR, \T_NS_C, \T_HALT_COMPILER, \T_FN,
            \T_MATCH,
        ], true);
    }
}
561: