1: <?php declare(strict_types = 1);
2:
3: namespace PHPStan\PhpDocParser\Lexer;
4:
5: use function implode;
6: use function preg_match_all;
7: use const PREG_SET_ORDER;
8:
/**
 * Splits a PHPDoc string into a flat list of [value, type] tokens.
 *
 * All token patterns are combined into one anchored alternation; each
 * alternative is tagged with a (*MARK:<type>) verb so that the matching
 * token type can be read back from the 'MARK' entry of every match.
 *
 * Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
 */
class Lexer
{

	// Token type identifiers. The numeric values are embedded in the generated
	// regular expression as MARK names and returned at TYPE_OFFSET of each token.
	public const TOKEN_REFERENCE = 0;
	public const TOKEN_UNION = 1;
	public const TOKEN_INTERSECTION = 2;
	public const TOKEN_NULLABLE = 3;
	public const TOKEN_OPEN_PARENTHESES = 4;
	public const TOKEN_CLOSE_PARENTHESES = 5;
	public const TOKEN_OPEN_ANGLE_BRACKET = 6;
	public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
	public const TOKEN_OPEN_SQUARE_BRACKET = 8;
	public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
	public const TOKEN_COMMA = 10;
	public const TOKEN_VARIADIC = 11;
	public const TOKEN_DOUBLE_COLON = 12;
	public const TOKEN_DOUBLE_ARROW = 13;
	public const TOKEN_EQUAL = 14;
	public const TOKEN_OPEN_PHPDOC = 15;
	public const TOKEN_CLOSE_PHPDOC = 16;
	public const TOKEN_PHPDOC_TAG = 17;
	public const TOKEN_FLOAT = 18;
	public const TOKEN_INTEGER = 19;
	public const TOKEN_SINGLE_QUOTED_STRING = 20;
	public const TOKEN_DOUBLE_QUOTED_STRING = 21;
	public const TOKEN_IDENTIFIER = 22;
	public const TOKEN_THIS_VARIABLE = 23;
	public const TOKEN_VARIABLE = 24;
	public const TOKEN_HORIZONTAL_WS = 25;
	public const TOKEN_PHPDOC_EOL = 26;
	public const TOKEN_OTHER = 27;
	public const TOKEN_END = 28;
	public const TOKEN_COLON = 29;
	public const TOKEN_WILDCARD = 30;
	public const TOKEN_OPEN_CURLY_BRACKET = 31;
	public const TOKEN_CLOSE_CURLY_BRACKET = 32;
	public const TOKEN_NEGATED = 33;
	public const TOKEN_ARROW = 34;

	// Display label for every token type.
	public const TOKEN_LABELS = [
		self::TOKEN_REFERENCE => '\'&\'',
		self::TOKEN_UNION => '\'|\'',
		self::TOKEN_INTERSECTION => '\'&\'',
		self::TOKEN_NULLABLE => '\'?\'',
		self::TOKEN_NEGATED => '\'!\'',
		self::TOKEN_OPEN_PARENTHESES => '\'(\'',
		self::TOKEN_CLOSE_PARENTHESES => '\')\'',
		self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
		self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
		self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
		self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
		self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
		self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
		self::TOKEN_COMMA => '\',\'',
		self::TOKEN_COLON => '\':\'',
		self::TOKEN_VARIADIC => '\'...\'',
		self::TOKEN_DOUBLE_COLON => '\'::\'',
		self::TOKEN_DOUBLE_ARROW => '\'=>\'',
		self::TOKEN_ARROW => '\'->\'',
		self::TOKEN_EQUAL => '\'=\'',
		self::TOKEN_OPEN_PHPDOC => '\'/**\'',
		self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
		self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
		self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
		self::TOKEN_FLOAT => 'TOKEN_FLOAT',
		self::TOKEN_INTEGER => 'TOKEN_INTEGER',
		self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
		self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
		self::TOKEN_IDENTIFIER => 'type',
		self::TOKEN_THIS_VARIABLE => '\'$this\'',
		self::TOKEN_VARIABLE => 'variable',
		self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
		self::TOKEN_OTHER => 'TOKEN_OTHER',
		self::TOKEN_END => 'TOKEN_END',
		self::TOKEN_WILDCARD => '*',
	];

	// Positions of the matched text and of the token type inside one token.
	public const VALUE_OFFSET = 0;
	public const TYPE_OFFSET = 1;

	/** @var string|null Combined token regex, built on the first tokenize() call. */
	private $regexp;

	/**
	 * Tokenizes the given string.
	 *
	 * Every token is a two-element array: the matched text at VALUE_OFFSET and
	 * the TOKEN_* type at TYPE_OFFSET. A final ['', TOKEN_END] token is always
	 * appended, so the result is never empty.
	 *
	 * NOTE(review): the return value of preg_match_all() is not checked; on a
	 * PCRE failure the result would presumably contain only the TOKEN_END
	 * token - confirm whether callers rely on that.
	 *
	 * NOTE(review): the expression is anchored, and whitespace other than tab,
	 * space and (CR)LF (e.g. a lone "\r") is matched by no pattern, so
	 * tokenization appears to stop silently at such a character - confirm
	 * whether this is intended.
	 *
	 * @param string $s text to tokenize
	 * @return array<int, array{0: string, 1: int}>
	 */
	public function tokenize(string $s): array
	{
		if ($this->regexp === null) {
			$this->regexp = $this->generateRegexp();
		}

		$matches = [];
		preg_match_all($this->regexp, $s, $matches, PREG_SET_ORDER);

		$tokens = [];
		foreach ($matches as $match) {
			// 'MARK' carries the name of the (*MARK) verb of the alternative
			// that matched, i.e. the numeric token type as a string.
			$tokens[] = [
				self::VALUE_OFFSET => $match[0],
				self::TYPE_OFFSET => (int) $match['MARK'],
			];
		}

		$tokens[] = [
			self::VALUE_OFFSET => '',
			self::TYPE_OFFSET => self::TOKEN_END,
		];

		return $tokens;
	}


	/**
	 * Builds the single regular expression used by tokenize().
	 *
	 * The order of the entries is significant: the alternatives are joined in
	 * this order and the first one that matches at the current position wins,
	 * so more specific patterns must precede more general ones (for example
	 * TOKEN_REFERENCE before TOKEN_INTERSECTION, TOKEN_FLOAT before
	 * TOKEN_INTEGER, and TOKEN_OTHER last).
	 *
	 * @return string delimited pattern including the "Asi" modifiers
	 */
	private function generateRegexp(): string
	{
		$patterns = [
			self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

			self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
			self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
			self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

			// '&' followed (after optional whitespace) by '.', ',', '=', ')'
			// or by a '$' that does not start the $this variable
			self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
			self::TOKEN_UNION => '\\|',
			self::TOKEN_INTERSECTION => '&',
			self::TOKEN_NULLABLE => '\\?',
			self::TOKEN_NEGATED => '!',

			self::TOKEN_OPEN_PARENTHESES => '\\(',
			self::TOKEN_CLOSE_PARENTHESES => '\\)',
			self::TOKEN_OPEN_ANGLE_BRACKET => '<',
			self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
			self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
			self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
			self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
			self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

			self::TOKEN_COMMA => ',',
			self::TOKEN_VARIADIC => '\\.\\.\\.',
			self::TOKEN_DOUBLE_COLON => '::',
			self::TOKEN_DOUBLE_ARROW => '=>',
			self::TOKEN_ARROW => '->',
			self::TOKEN_EQUAL => '=',
			self::TOKEN_COLON => ':',

			self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
			self::TOKEN_CLOSE_PHPDOC => '\\*/',
			self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
			self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

			self::TOKEN_FLOAT => '(?:-?[0-9]++\\.[0-9]*+(?:e-?[0-9]++)?)|(?:-?[0-9]*+\\.[0-9]++(?:e-?[0-9]++)?)|(?:-?[0-9]++e-?[0-9]++)',
			self::TOKEN_INTEGER => '-?(?:(?:0b[0-1]++)|(?:0o[0-7]++)|(?:0x[0-9a-f]++)|(?:[0-9]++))',
			self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
			self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',

			self::TOKEN_WILDCARD => '\\*',

			// anything but TOKEN_CLOSE_PHPDOC or TOKEN_HORIZONTAL_WS or TOKEN_PHPDOC_EOL
			self::TOKEN_OTHER => '(?:(?!\\*/)[^\\s])++',
		];

		// Wrap each pattern in its own group and tag it with its token type.
		// Iterating by value into a fresh list avoids leaving a dangling
		// reference to the last array element behind.
		$alternatives = [];
		foreach ($patterns as $type => $pattern) {
			$alternatives[] = '(?:' . $pattern . ')(*MARK:' . $type . ')';
		}

		return '~' . implode('|', $alternatives) . '~Asi';
	}

}
171: