1: <?php declare(strict_types = 1);
2:
3: namespace PHPStan\PhpDocParser\Lexer;
4:
5: use function implode;
6: use function preg_match_all;
7: use const PREG_SET_ORDER;
8:
/**
 * Splits a PHPDoc comment string into a flat list of tokens.
 *
 * Every token is a three-element array holding the matched text, one of the
 * TOKEN_* type constants and the 1-based line number, addressable through
 * VALUE_OFFSET, TYPE_OFFSET and LINE_OFFSET.
 *
 * Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
 */
class Lexer
{

	// Token type identifiers. The numeric values are embedded into the generated
	// regular expression as (*MARK:n) names and read back in tokenize().
	public const TOKEN_REFERENCE = 0;
	public const TOKEN_UNION = 1;
	public const TOKEN_INTERSECTION = 2;
	public const TOKEN_NULLABLE = 3;
	public const TOKEN_OPEN_PARENTHESES = 4;
	public const TOKEN_CLOSE_PARENTHESES = 5;
	public const TOKEN_OPEN_ANGLE_BRACKET = 6;
	public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
	public const TOKEN_OPEN_SQUARE_BRACKET = 8;
	public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
	public const TOKEN_COMMA = 10;
	public const TOKEN_VARIADIC = 11;
	public const TOKEN_DOUBLE_COLON = 12;
	public const TOKEN_DOUBLE_ARROW = 13;
	public const TOKEN_EQUAL = 14;
	public const TOKEN_OPEN_PHPDOC = 15;
	public const TOKEN_CLOSE_PHPDOC = 16;
	public const TOKEN_PHPDOC_TAG = 17;
	public const TOKEN_DOCTRINE_TAG = 18;
	public const TOKEN_FLOAT = 19;
	public const TOKEN_INTEGER = 20;
	public const TOKEN_SINGLE_QUOTED_STRING = 21;
	public const TOKEN_DOUBLE_QUOTED_STRING = 22;
	public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
	public const TOKEN_IDENTIFIER = 24;
	public const TOKEN_THIS_VARIABLE = 25;
	public const TOKEN_VARIABLE = 26;
	public const TOKEN_HORIZONTAL_WS = 27;
	public const TOKEN_PHPDOC_EOL = 28;
	public const TOKEN_OTHER = 29;
	public const TOKEN_END = 30;
	public const TOKEN_COLON = 31;
	public const TOKEN_WILDCARD = 32;
	public const TOKEN_OPEN_CURLY_BRACKET = 33;
	public const TOKEN_CLOSE_CURLY_BRACKET = 34;
	public const TOKEN_NEGATED = 35;
	public const TOKEN_ARROW = 36;

	// Label string for every token type: fixed-text tokens are shown as the quoted
	// literal, the remaining ones as a descriptive name.
	public const TOKEN_LABELS = [
		self::TOKEN_REFERENCE => '\'&\'',
		self::TOKEN_UNION => '\'|\'',
		self::TOKEN_INTERSECTION => '\'&\'',
		self::TOKEN_NULLABLE => '\'?\'',
		self::TOKEN_NEGATED => '\'!\'',
		self::TOKEN_OPEN_PARENTHESES => '\'(\'',
		self::TOKEN_CLOSE_PARENTHESES => '\')\'',
		self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
		self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
		self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
		self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
		self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
		self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
		self::TOKEN_COMMA => '\',\'',
		self::TOKEN_COLON => '\':\'',
		self::TOKEN_VARIADIC => '\'...\'',
		self::TOKEN_DOUBLE_COLON => '\'::\'',
		self::TOKEN_DOUBLE_ARROW => '\'=>\'',
		self::TOKEN_ARROW => '\'->\'',
		self::TOKEN_EQUAL => '\'=\'',
		self::TOKEN_OPEN_PHPDOC => '\'/**\'',
		self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
		self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
		self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
		self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
		self::TOKEN_FLOAT => 'TOKEN_FLOAT',
		self::TOKEN_INTEGER => 'TOKEN_INTEGER',
		self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
		self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
		self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
		self::TOKEN_IDENTIFIER => 'type',
		self::TOKEN_THIS_VARIABLE => '\'$this\'',
		self::TOKEN_VARIABLE => 'variable',
		self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
		self::TOKEN_OTHER => 'TOKEN_OTHER',
		self::TOKEN_END => 'TOKEN_END',
		self::TOKEN_WILDCARD => '*',
	];

	// Positions of the individual fields inside a token array produced by tokenize().
	public const VALUE_OFFSET = 0;
	public const TYPE_OFFSET = 1;
	public const LINE_OFFSET = 2;

	/** @var bool Whether the Doctrine-specific token patterns are part of the generated regexp. */
	private $parseDoctrineAnnotations;

	/** @var string|null Generated tokenizer regexp; null until the first tokenize() call. */
	private $regexp;

	/**
	 * @param bool $parseDoctrineAnnotations When true, TOKEN_DOCTRINE_TAG and
	 *                                       TOKEN_DOCTRINE_ANNOTATION_STRING patterns are added to the regexp.
	 */
	public function __construct(bool $parseDoctrineAnnotations = false)
	{
		$this->parseDoctrineAnnotations = $parseDoctrineAnnotations;
	}

	/**
	 * Tokenizes the given string.
	 *
	 * The line number starts at 1 and is incremented after every TOKEN_PHPDOC_EOL
	 * token, so an end-of-line token still carries the number of the line it
	 * terminates. A TOKEN_END token with an empty value is always appended last.
	 *
	 * @return list<array{string, int, int}>
	 */
	public function tokenize(string $s): array
	{
		// The regexp depends only on constructor state, so build it once and reuse it.
		$regexp = $this->regexp;
		if ($regexp === null) {
			$regexp = $this->generateRegexp();
			$this->regexp = $regexp;
		}

		preg_match_all($regexp, $s, $matches, PREG_SET_ORDER);

		$tokens = [];
		$currentLine = 1;
		foreach ($matches as $match) {
			// The name of the (*MARK) verb that was passed is the numeric token type.
			$tokenType = (int) $match['MARK'];
			$tokens[] = [
				self::VALUE_OFFSET => $match[0],
				self::TYPE_OFFSET => $tokenType,
				self::LINE_OFFSET => $currentLine,
			];

			if ($tokenType === self::TOKEN_PHPDOC_EOL) {
				$currentLine++;
			}
		}

		$tokens[] = [
			self::VALUE_OFFSET => '',
			self::TYPE_OFFSET => self::TOKEN_END,
			self::LINE_OFFSET => $currentLine,
		];

		return $tokens;
	}


	/**
	 * Builds the single regular expression used by tokenize().
	 *
	 * Each token pattern becomes one alternative of the form `(?:pattern)(*MARK:type)`.
	 * Alternatives are tried in the order they are listed here, so the order of
	 * the entries is significant (e.g. TOKEN_REFERENCE precedes TOKEN_INTERSECTION
	 * and TOKEN_CLOSE_PHPDOC precedes TOKEN_WILDCARD).
	 */
	private function generateRegexp(): string
	{
		$patterns = [
			self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

			self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
			self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
			self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

			// '&' followed (after optional whitespace) by the start of TOKEN_VARIADIC, TOKEN_COMMA,
			// TOKEN_EQUAL, TOKEN_CLOSE_PARENTHESES or a '$' that does not begin TOKEN_THIS_VARIABLE
			self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
			self::TOKEN_UNION => '\\|',
			self::TOKEN_INTERSECTION => '&',
			self::TOKEN_NULLABLE => '\\?',
			self::TOKEN_NEGATED => '!',

			self::TOKEN_OPEN_PARENTHESES => '\\(',
			self::TOKEN_CLOSE_PARENTHESES => '\\)',
			self::TOKEN_OPEN_ANGLE_BRACKET => '<',
			self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
			self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
			self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
			self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
			self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

			self::TOKEN_COMMA => ',',
			self::TOKEN_VARIADIC => '\\.\\.\\.',
			self::TOKEN_DOUBLE_COLON => '::',
			self::TOKEN_DOUBLE_ARROW => '=>',
			self::TOKEN_ARROW => '->',
			self::TOKEN_EQUAL => '=',
			self::TOKEN_COLON => ':',

			self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
			self::TOKEN_CLOSE_PHPDOC => '\\*/',
			self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
			// Line break plus the indentation and optional leading '*' of the following line
			self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

			self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
			self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
			self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
			self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',

			self::TOKEN_WILDCARD => '\\*',
		];

		if ($this->parseDoctrineAnnotations) {
			// Both keys are new, so the union appends them after the patterns above.
			$patterns += [
				self::TOKEN_DOCTRINE_TAG => '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*',
				self::TOKEN_DOCTRINE_ANNOTATION_STRING => '"(?:""|[^"])*+"',
			];
		}

		// Catch-all, tried last: a run of non-whitespace characters that stops before '*/'
		$patterns[self::TOKEN_OTHER] = '(?:(?!\\*/)[^\\s])++';

		$alternatives = [];
		foreach ($patterns as $type => $pattern) {
			$alternatives[] = '(?:' . $pattern . ')(*MARK:' . $type . ')';
		}

		// Modifiers: A = anchored, s = dot matches newlines, i = case-insensitive
		return '~' . implode('|', $alternatives) . '~Asi';
	}

}
199: