1: | <?php declare(strict_types=1); |
2: | |
3: | namespace PhpParser; |
4: | |
5: | use PhpParser\Parser\Tokens; |
6: | |
7: | class Lexer |
8: | { |
/** @var string Source code passed to startLexing(); kept so later steps can read substrings of it */
protected $code;
/** @var array Tokens produced by token_get_all() and then adjusted by postprocessTokens() */
protected $tokens;
/** @var int Index into $tokens of the token most recently visited by getNextToken(); -1 before the first call */
protected $pos;
/** @var int Line number tracked while tokens are consumed; starts at 1 */
protected $line;
/** @var int Byte offset into $code tracked while tokens are consumed; starts at 0 */
protected $filePos;
/** @var bool Whether the last T_CLOSE_TAG seen contained "\n" or "\r"; true until one has been seen */
protected $prevCloseTagHasNewline;

/** @var array Map from PHP token IDs to the IDs defined on PhpParser\Parser\Tokens */
protected $tokenMap;
/** @var array Set (token ID => 1) of tokens that getNextToken() skips instead of returning */
protected $dropTokens;
/** @var array Set (token ID => true) of tokens accepted as a segment when merging namespaced names */
protected $identifierTokens;

/** @var bool Whether getNextToken() records the 'startLine' attribute */
private $attributeStartLineUsed;
/** @var bool Whether getNextToken() records the 'endLine' attribute */
private $attributeEndLineUsed;
/** @var bool Whether getNextToken() records the 'startTokenPos' attribute */
private $attributeStartTokenPosUsed;
/** @var bool Whether getNextToken() records the 'endTokenPos' attribute */
private $attributeEndTokenPosUsed;
/** @var bool Whether getNextToken() records the 'startFilePos' attribute */
private $attributeStartFilePosUsed;
/** @var bool Whether getNextToken() records the 'endFilePos' attribute */
private $attributeEndFilePosUsed;
/** @var bool Whether getNextToken() collects skipped comments into the 'comments' attribute */
private $attributeCommentsUsed;
27: | |
28: | |
29: | |
30: | |
31: | |
32: | |
33: | |
34: | |
35: | |
36: | |
/**
 * Creates a Lexer.
 *
 * @param array $options Lexer options. The only key read here is 'usedAttributes': a list of
 *                       attribute names to record for each token. Recognised names are
 *                       'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos',
 *                       'startFilePos' and 'endFilePos'. When the key is absent, 'comments',
 *                       'startLine' and 'endLine' are recorded.
 */
public function __construct(array $options = []) {
    // The compatibility constants have to exist before the maps below reference them.
    $this->defineCompatibilityTokens();
    $this->tokenMap = $this->createTokenMap();
    $this->identifierTokens = $this->createIdentifierTokenMap();

    // This class only ever tests the keys with isset(), so the value 1 is a mere placeholder.
    $skipped = [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, \T_BAD_CHARACTER];
    $this->dropTokens = array_fill_keys($skipped, 1);

    $requested = $options['usedAttributes'] ?? ['comments', 'startLine', 'endLine'];
    $enabled = array_fill_keys($requested, true);
    $isEnabled = static function (string $attribute) use ($enabled): bool {
        return isset($enabled[$attribute]);
    };

    // Resolve each attribute to a plain boolean once, so getNextToken() only checks properties.
    $this->attributeStartLineUsed = $isEnabled('startLine');
    $this->attributeEndLineUsed = $isEnabled('endLine');
    $this->attributeStartTokenPosUsed = $isEnabled('startTokenPos');
    $this->attributeEndTokenPosUsed = $isEnabled('endTokenPos');
    $this->attributeStartFilePosUsed = $isEnabled('startFilePos');
    $this->attributeEndFilePosUsed = $isEnabled('endFilePos');
    $this->attributeCommentsUsed = $isEnabled('comments');
}
61: | |
62: | |
63: | |
64: | |
65: | |
66: | |
67: | |
68: | |
69: | |
70: | |
71: | |
/**
 * Resets the lexer state and tokenizes the given code.
 *
 * Errors found while post-processing the tokens are passed to the error handler. With the
 * default throwing handler this method therefore propagates the first error as an exception.
 *
 * @param string            $code         Source code to tokenize
 * @param ErrorHandler|null $errorHandler Receiver for lexing errors; a throwing handler is used
 *                                        when null is given
 */
public function startLexing(string $code, ErrorHandler $errorHandler = null) {
    if (null === $errorHandler) {
        $errorHandler = new ErrorHandler\Throwing();
    }

    $this->code = $code;
    $this->pos = -1;
    $this->line = 1;
    $this->filePos = 0;

    // Until a close tag has been seen, inline HTML is reported as having a leading newline.
    $this->prevCloseTagHasNewline = true;

    // xdebug.scream would override the @ operator used below, so switch it off temporarily.
    $scream = ini_set('xdebug.scream', '0');

    try {
        // Tokenizer diagnostics are silenced here; postprocessTokens() reports errors itself.
        $this->tokens = @token_get_all($code);
        $this->postprocessTokens($errorHandler);
    } finally {
        // Restore the setting even when the error handler throws, so a lexing error does not
        // leave xdebug.scream disabled for the rest of the process.
        if (false !== $scream) {
            ini_set('xdebug.scream', $scream);
        }
    }
}
95: | |
/**
 * Reports every byte of $this->code in the range [$start, $end) as an unexpected character.
 *
 * @param int          $start        Offset of the first invalid byte
 * @param int|false    $end          Offset just past the last invalid byte; callers may pass the
 *                                   result of strpos(), in which case false reports nothing
 * @param int          $line         Line number attached to every error and token
 * @param ErrorHandler $errorHandler Receives one Error per byte
 *
 * @return array One [T_BAD_CHARACTER, byte, line] token per reported byte
 */
private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
    $badTokens = [];
    for ($offset = $start; $offset < $end; $offset++) {
        $char = $this->code[$offset];

        // A null byte gets a fixed message instead of being embedded into the text.
        $message = "\0" === $char
            ? 'Unexpected null byte'
            : sprintf('Unexpected character "%s" (ASCII %d)', $char, ord($char));

        $location = [
            'startLine' => $line,
            'endLine' => $line,
            'startFilePos' => $offset,
            'endFilePos' => $offset,
        ];

        $badTokens[] = [\T_BAD_CHARACTER, $char, $line];
        $errorHandler->handleError(new Error($message, $location));
    }

    return $badTokens;
}
119: | |
120: | |
121: | |
122: | |
123: | |
124: | |
/**
 * Checks whether a token is a block comment that lacks its closing delimiter.
 *
 * Only T_COMMENT / T_DOC_COMMENT tokens whose text starts with the block-comment opener
 * are considered; everything else (including plain string tokens) yields false.
 *
 * @param array|string $token Token in token_get_all() format
 *
 * @return bool True if the comment was opened but never closed
 */
private function isUnterminatedComment($token) : bool {
    if ($token[0] !== \T_COMMENT && $token[0] !== \T_DOC_COMMENT) {
        return false;
    }

    $text = $token[1];
    if (substr($text, 0, 2) !== '/*') {
        return false;
    }

    // The closer must not overlap the opener: a three-byte comment of slash, star, slash both
    // starts with the opener and ends with the closer, yet is still open. The shortest
    // terminated block comment is four bytes long.
    return \strlen($text) < 4 || substr($text, -2) !== '*/';
}
130: | |
/**
 * Normalizes $this->tokens in place and reports lexing errors.
 *
 * While walking the token array this method:
 *  - reports invalid characters, both those already marked as T_BAD_CHARACTER and those
 *    found as gaps between the token texts and $this->code;
 *  - moves the trailing newline of single-line comments into a T_WHITESPACE token;
 *  - merges sequences of identifier tokens and T_NS_SEPARATOR into one T_NAME_* token;
 *  - replaces the plain '&' token by one of the two T_AMPERSAND_* tokens.
 * Afterwards it reports an unterminated trailing comment, if there is one.
 *
 * @param ErrorHandler $errorHandler Receives every error that is detected
 */
protected function postprocessTokens(ErrorHandler $errorHandler) {
    // $filePos / $line track where the current token starts inside $this->code.
    $filePos = 0;
    $line = 1;
    // Kept up to date manually, because the loop inserts and removes tokens.
    $numTokens = \count($this->tokens);
    for ($i = 0; $i < $numTokens; $i++) {
        $token = $this->tokens[$i];

        // The token is already part of the array, so only the error needs to be emitted;
        // the tokens returned by the helper are discarded.
        if ($token[0] === \T_BAD_CHARACTER) {
            $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
        }

        // Single-line comment (not starting with "/*") that ends in a newline:
        // strip the newline from the comment and hand it over to whitespace.
        if ($token[0] === \T_COMMENT && substr($token[1], 0, 2) !== '/*'
                && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
            $trailingNewline = $matches[0];
            $token[1] = substr($token[1], 0, -strlen($trailingNewline));
            $this->tokens[$i] = $token;
            if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
                // Prepend the newline to the whitespace token that follows; that token now
                // starts one line earlier.
                $this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
                $this->tokens[$i + 1][2]--;
            } else {
                // No whitespace token follows, so insert a new one holding just the newline.
                array_splice($this->tokens, $i + 1, 0, [
                    [\T_WHITESPACE, $trailingNewline, $line],
                ]);
                $numTokens++;
            }
        }

        // Combine alternating identifier / T_NS_SEPARATOR tokens into a single name token.
        if (\is_array($token)
                && ($token[0] === \T_NS_SEPARATOR || isset($this->identifierTokens[$token[0]]))) {
            $lastWasSeparator = $token[0] === \T_NS_SEPARATOR;
            $text = $token[1];
            // $j ends up as the index of the first token that is not part of the name.
            for ($j = $i + 1; isset($this->tokens[$j]); $j++) {
                if ($lastWasSeparator) {
                    if (!isset($this->identifierTokens[$this->tokens[$j][0]])) {
                        break;
                    }
                    $lastWasSeparator = false;
                } else {
                    if ($this->tokens[$j][0] !== \T_NS_SEPARATOR) {
                        break;
                    }
                    $lastWasSeparator = true;
                }
                $text .= $this->tokens[$j][1];
            }
            if ($lastWasSeparator) {
                // A separator at the very end does not belong to the name; give it back.
                $j--;
                $text = substr($text, 0, -1);
            }
            // Only replace anything if more than the initial token was consumed.
            if ($j > $i + 1) {
                // The kind of name is decided by the token the sequence started with.
                if ($token[0] === \T_NS_SEPARATOR) {
                    $type = \T_NAME_FULLY_QUALIFIED;
                } else if ($token[0] === \T_NAMESPACE) {
                    $type = \T_NAME_RELATIVE;
                } else {
                    $type = \T_NAME_QUALIFIED;
                }
                $token = [$type, $text, $line];
                // $j - $i tokens are replaced by one, so the array shrinks by $j - $i - 1.
                array_splice($this->tokens, $i, $j - $i, [$token]);
                $numTokens -= $j - $i - 1;
            }
        }

        // Classify '&' by the first non-whitespace token after it.
        if ($token === '&') {
            $next = $i + 1;
            while (isset($this->tokens[$next]) && $this->tokens[$next][0] === \T_WHITESPACE) {
                $next++;
            }
            $followedByVarOrVarArg = isset($this->tokens[$next]) &&
                ($this->tokens[$next][0] === \T_VARIABLE || $this->tokens[$next][0] === \T_ELLIPSIS);
            $this->tokens[$i] = $token = [
                $followedByVarOrVarArg
                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG,
                '&',
                $line,
            ];
        }

        $tokenValue = \is_string($token) ? $token : $token[1];
        $tokenLen = \strlen($tokenValue);

        // The token text is expected at $filePos. If it is not there, the bytes up to its
        // next occurrence were skipped by the tokenizer and are reported as invalid.
        if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
            $nextFilePos = strpos($this->code, $tokenValue, $filePos);
            $badCharTokens = $this->handleInvalidCharacterRange(
                $filePos, $nextFilePos, $line, $errorHandler);
            // NOTE(review): if strpos() returned false this resets $filePos to 0 - confirm
            // that the token text is always found.
            $filePos = (int) $nextFilePos;

            // Insert the bad-character tokens in front of the current token and skip past
            // them, so $i points at the current token again.
            array_splice($this->tokens, $i, 0, $badCharTokens);
            $numTokens += \count($badCharTokens);
            $i += \count($badCharTokens);
        }

        $filePos += $tokenLen;
        $line += substr_count($tokenValue, "\n");
    }

    // The tokens did not cover the whole input: handle what is left over.
    if ($filePos !== \strlen($this->code)) {
        if (substr($this->code, $filePos, 2) === '/*') {
            // The remainder is a block comment that did not become a token.
            $comment = substr($this->code, $filePos);
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $line,
                'endLine' => $line + substr_count($comment, "\n"),
                'startFilePos' => $filePos,
                'endFilePos' => $filePos + \strlen($comment),
            ]));

            // Append the comment as a token so it is not lost.
            // NOTE(review): index 3 is the byte after a leading "/**" - confirm this is the
            // intended doc-comment test.
            $isDocComment = isset($comment[3]) && $comment[3] === '*';
            $this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
        } else {
            // Anything else at the end of the input is reported byte by byte.
            $badCharTokens = $this->handleInvalidCharacterRange(
                $filePos, \strlen($this->code), $line, $errorHandler);
            $this->tokens = array_merge($this->tokens, $badCharTokens);
        }
        return;
    }

    if (count($this->tokens) > 0) {
        // Input fully covered: the last token may still be a comment without its closer.
        // At this point $line / $filePos describe the end of that token.
        $lastToken = $this->tokens[count($this->tokens) - 1];
        if ($this->isUnterminatedComment($lastToken)) {
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $line - substr_count($lastToken[1], "\n"),
                'endLine' => $line,
                'startFilePos' => $filePos - \strlen($lastToken[1]),
                'endFilePos' => $filePos,
            ]));
        }
    }
}
283: | |
284: | |
285: | |
286: | |
287: | |
288: | |
289: | |
290: | |
291: | |
292: | |
293: | |
294: | |
295: | |
296: | |
297: | |
298: | |
299: | |
300: | |
301: | |
302: | |
303: | |
304: | |
305: | |
/**
 * Advances to the next token that is not dropped and returns its ID.
 *
 * Tokens listed in $this->dropTokens are skipped. Skipped comments are collected into
 * $startAttributes['comments'] when the 'comments' attribute is enabled.
 *
 * @param mixed $value           Set to the text of the returned token
 * @param mixed $startAttributes Set to the enabled start attributes ('startLine',
 *                               'startTokenPos', 'startFilePos', 'comments'); inline HTML
 *                               additionally gets 'hasLeadingNewline'
 * @param mixed $endAttributes   Set to the enabled end attributes ('endLine', 'endTokenPos',
 *                               'endFilePos')
 *
 * @return int Mapped ID for array tokens, the character's ordinal for string tokens, and 0
 *             once the end of the token array has been passed
 */
public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) : int {
    $startAttributes = [];
    $endAttributes = [];

    for (;;) {
        $this->pos++;
        // Behind the last token a lone null byte is used; its ordinal 0 is the returned ID.
        $token = $this->tokens[$this->pos] ?? "\0";

        // Start attributes are rewritten on every iteration, so they end up describing the
        // token that is finally returned rather than a skipped one.
        if ($this->attributeStartLineUsed) {
            $startAttributes['startLine'] = $this->line;
        }
        if ($this->attributeStartTokenPosUsed) {
            $startAttributes['startTokenPos'] = $this->pos;
        }
        if ($this->attributeStartFilePosUsed) {
            $startAttributes['startFilePos'] = $this->filePos;
        }

        if (\is_string($token)) {
            $value = $token;
            // A string token longer than one byte is counted as two bytes and reported
            // with the ID of a double quote.
            $isTwoBytes = isset($token[1]);
            $this->filePos += $isTwoBytes ? 2 : 1;
            $id = $isTwoBytes ? ord('"') : ord($token);
        } elseif (isset($this->dropTokens[$token[0]])) {
            $startLine = $this->line;
            $startFilePos = $this->filePos;
            $text = $token[1];
            $this->line += substr_count($text, "\n");
            $this->filePos += \strlen($text);

            $isDocComment = \T_DOC_COMMENT === $token[0];
            if ($this->attributeCommentsUsed && ($isDocComment || \T_COMMENT === $token[0])) {
                if ($isDocComment) {
                    $comment = new Comment\Doc(
                        $text,
                        $startLine, $startFilePos, $this->pos,
                        $this->line, $this->filePos - 1, $this->pos
                    );
                } else {
                    $comment = new Comment(
                        $text,
                        $startLine, $startFilePos, $this->pos,
                        $this->line, $this->filePos - 1, $this->pos
                    );
                }
                $startAttributes['comments'][] = $comment;
            }

            // Dropped tokens are never returned; look at the next one.
            continue;
        } else {
            $value = $token[1];
            $id = $this->tokenMap[$token[0]];

            if (\T_CLOSE_TAG === $token[0]) {
                // Remembered for the inline HTML that may follow this close tag.
                $hasLineFeed = false !== strpos($value, "\n");
                $this->prevCloseTagHasNewline = $hasLineFeed || false !== strpos($value, "\r");
            } elseif (\T_INLINE_HTML === $token[0]) {
                $startAttributes['hasLeadingNewline'] = $this->prevCloseTagHasNewline;
            }

            $this->line += substr_count($value, "\n");
            $this->filePos += \strlen($value);
        }

        // The position counters now point behind the token, hence the - 1 for the file offset.
        if ($this->attributeEndLineUsed) {
            $endAttributes['endLine'] = $this->line;
        }
        if ($this->attributeEndTokenPosUsed) {
            $endAttributes['endTokenPos'] = $this->pos;
        }
        if ($this->attributeEndFilePosUsed) {
            $endAttributes['endFilePos'] = $this->filePos - 1;
        }

        return $id;
    }

    throw new \RuntimeException('Reached end of lexer loop');
}
386: | |
387: | |
388: | |
389: | |
390: | |
391: | |
392: | |
393: | |
394: | |
395: | |
396: | |
/**
 * Returns the token array built by the last call to startLexing().
 *
 * The tokens use the token_get_all() layout, with the adjustments made by
 * postprocessTokens() already applied.
 *
 * @return array Array of tokens
 */
public function getTokens(): array {
    return $this->tokens;
}
400: | |
401: | |
402: | |
403: | |
404: | |
405: | |
/**
 * Consumes the "();" (or "() ?>") that has to follow __HALT_COMPILER and returns the rest.
 *
 * Presumably called right after getNextToken() returned the T_HALT_COMPILER token, so that
 * the current file position is just behind it - confirm against the caller.
 *
 * @return string Everything in the code that follows the matched terminator
 *
 * @throws Error If the code at the current position does not start with "()" and a terminator
 */
public function handleHaltCompiler() : string {
    // Remaining code from the current position on, still including the "();" part.
    $remainder = substr($this->code, $this->filePos);

    // Only whitespace is allowed between the individual characters of the terminator.
    $found = preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $remainder, $terminator);
    if (!$found) {
        throw new Error('__HALT_COMPILER must be followed by "();"');
    }

    // Jump behind the last token, so getNextToken() reports the end of input from now on.
    $this->pos = count($this->tokens);

    return substr($remainder, strlen($terminator[0]));
}
423: | |
/**
 * Defines the T_* constants listed below that the running PHP does not provide.
 *
 * The work is done once per process; later calls return immediately.
 *
 * @throws \Error If two of the listed constants are already defined with the same ID
 */
private function defineCompatibilityTokens() {
    static $alreadyDone = false;
    if ($alreadyDone) {
        return;
    }

    $names = [
        'T_BAD_CHARACTER',
        'T_FN',
        'T_COALESCE_EQUAL',
        'T_NAME_QUALIFIED',
        'T_NAME_FULLY_QUALIFIED',
        'T_NAME_RELATIVE',
        'T_MATCH',
        'T_NULLSAFE_OBJECT_OPERATOR',
        'T_ATTRIBUTE',
        'T_ENUM',
        'T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG',
        'T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG',
        'T_READONLY',
    ];

    // Pass 1: collect the IDs of the constants that exist already and make sure that no ID
    // is shared by two of them.
    $takenIds = [];
    foreach ($names as $name) {
        if (!\defined($name)) {
            continue;
        }

        $id = \constant($name);
        if (isset($takenIds[$id])) {
            throw new \Error(sprintf(
                'Token %s has same ID as token %s, ' .
                'you may be using a library with broken token emulation',
                $name, $takenIds[$id]
            ));
        }
        $takenIds[$id] = $name;
    }

    // Pass 2: define what is missing, counting down from -1 and skipping IDs found in pass 1.
    $candidate = -1;
    foreach ($names as $name) {
        if (\defined($name)) {
            continue;
        }

        while (isset($takenIds[$candidate])) {
            $candidate--;
        }
        \define($name, $candidate);
        $candidate--;
    }

    $alreadyDone = true;
}
483: | |
484: | |
485: | |
486: | |
487: | |
488: | |
489: | |
490: | |
491: | |
492: | |
/**
 * Builds the map from PHP token IDs to the token IDs defined on the Tokens class.
 *
 * IDs in the range 256..999 are mapped by name, with a few special cases. Tokens for which
 * neither a special case nor a same-named Tokens constant exists are left out of the map.
 *
 * @return array Map of the form [phpTokenId => parserTokenId]
 */
protected function createTokenMap() : array {
    $map = [];

    for ($id = 256; $id < 1000; ++$id) {
        if (\T_DOUBLE_COLON === $id) {
            // The parser knows "::" under a different constant name.
            $map[$id] = Tokens::T_PAAMAYIM_NEKUDOTAYIM;
            continue;
        }
        if (\T_OPEN_TAG_WITH_ECHO === $id) {
            // Reported to the parser as an echo.
            $map[$id] = Tokens::T_ECHO;
            continue;
        }
        if (\T_CLOSE_TAG === $id) {
            // Reported to the parser as a semicolon.
            $map[$id] = ord(';');
            continue;
        }

        $name = token_name($id);
        if ('UNKNOWN' === $name) {
            continue;
        }
        if ('T_HASHBANG' === $name) {
            $map[$id] = Tokens::T_INLINE_HTML;
            continue;
        }

        // Everything else maps onto the Tokens constant of the same name, where one exists.
        $parserConstant = Tokens::class . '::' . $name;
        if (defined($parserConstant)) {
            $map[$id] = constant($parserConstant);
        }
    }

    // These two constants are only mapped when the runtime defines them.
    if (defined('T_ONUMBER')) {
        $map[\T_ONUMBER] = Tokens::T_DNUMBER;
    }
    if (defined('T_COMPILER_HALT_OFFSET')) {
        $map[\T_COMPILER_HALT_OFFSET] = Tokens::T_STRING;
    }

    // Assigned explicitly, as their IDs may lie outside of the range scanned above.
    $explicit = [
        \T_FN => Tokens::T_FN,
        \T_COALESCE_EQUAL => Tokens::T_COALESCE_EQUAL,
        \T_NAME_QUALIFIED => Tokens::T_NAME_QUALIFIED,
        \T_NAME_FULLY_QUALIFIED => Tokens::T_NAME_FULLY_QUALIFIED,
        \T_NAME_RELATIVE => Tokens::T_NAME_RELATIVE,
        \T_MATCH => Tokens::T_MATCH,
        \T_NULLSAFE_OBJECT_OPERATOR => Tokens::T_NULLSAFE_OBJECT_OPERATOR,
        \T_ATTRIBUTE => Tokens::T_ATTRIBUTE,
        \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG => Tokens::T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG,
        \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG => Tokens::T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG,
        \T_ENUM => Tokens::T_ENUM,
        \T_READONLY => Tokens::T_READONLY,
    ];
    foreach ($explicit as $phpId => $parserId) {
        $map[$phpId] = $parserId;
    }

    return $map;
}
544: | |
/**
 * Builds the set of tokens that count as an identifier when names are merged.
 *
 * Besides T_STRING this covers keyword tokens, so that a keyword can form one segment of a
 * namespaced name in postprocessTokens().
 *
 * @return array Map of the form [tokenId => true]
 */
private function createIdentifierTokenMap(): array {
    $identifierLike = [
        \T_STRING,
        \T_STATIC, \T_ABSTRACT, \T_FINAL, \T_PRIVATE, \T_PROTECTED, \T_PUBLIC,
        \T_READONLY,
        \T_INCLUDE, \T_INCLUDE_ONCE, \T_EVAL, \T_REQUIRE, \T_REQUIRE_ONCE,
        \T_LOGICAL_OR, \T_LOGICAL_XOR, \T_LOGICAL_AND,
        \T_INSTANCEOF, \T_NEW, \T_CLONE, \T_EXIT,
        \T_IF, \T_ELSEIF, \T_ELSE, \T_ENDIF,
        \T_ECHO, \T_DO, \T_WHILE,
        \T_ENDWHILE, \T_FOR, \T_ENDFOR, \T_FOREACH, \T_ENDFOREACH,
        \T_DECLARE, \T_ENDDECLARE, \T_AS, \T_TRY, \T_CATCH,
        \T_FINALLY, \T_THROW, \T_USE, \T_INSTEADOF, \T_GLOBAL, \T_VAR,
        \T_UNSET, \T_ISSET, \T_EMPTY, \T_CONTINUE, \T_GOTO,
        \T_FUNCTION, \T_CONST, \T_RETURN, \T_PRINT, \T_YIELD, \T_LIST,
        \T_SWITCH, \T_ENDSWITCH, \T_CASE, \T_DEFAULT,
        \T_BREAK, \T_ARRAY, \T_CALLABLE, \T_EXTENDS, \T_IMPLEMENTS,
        \T_NAMESPACE, \T_TRAIT, \T_INTERFACE, \T_CLASS,
        \T_CLASS_C, \T_TRAIT_C, \T_FUNC_C, \T_METHOD_C,
        \T_LINE, \T_FILE, \T_DIR, \T_NS_C,
        \T_HALT_COMPILER, \T_FN,
        \T_MATCH,
    ];

    return array_fill_keys($identifierLike, true);
}
560: | } |
561: | |