1: | <?php declare(strict_types = 1); |
2: | |
3: | namespace PHPStan\PhpDocParser\Lexer; |
4: | |
5: | use function implode; |
6: | use function preg_match_all; |
7: | use const PREG_SET_ORDER; |
8: | |
9: | |
10: | |
11: | |
/**
 * Splits a string into a flat list of tokens using one combined regular expression.
 *
 * Every token is a three-element array indexed by VALUE_OFFSET (the matched text),
 * TYPE_OFFSET (one of the TOKEN_* constants) and LINE_OFFSET (1-based line number).
 */
class Lexer
{

	public const TOKEN_REFERENCE = 0;
	public const TOKEN_UNION = 1;
	public const TOKEN_INTERSECTION = 2;
	public const TOKEN_NULLABLE = 3;
	public const TOKEN_OPEN_PARENTHESES = 4;
	public const TOKEN_CLOSE_PARENTHESES = 5;
	public const TOKEN_OPEN_ANGLE_BRACKET = 6;
	public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
	public const TOKEN_OPEN_SQUARE_BRACKET = 8;
	public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
	public const TOKEN_COMMA = 10;
	public const TOKEN_VARIADIC = 11;
	public const TOKEN_DOUBLE_COLON = 12;
	public const TOKEN_DOUBLE_ARROW = 13;
	public const TOKEN_EQUAL = 14;
	public const TOKEN_OPEN_PHPDOC = 15;
	public const TOKEN_CLOSE_PHPDOC = 16;
	public const TOKEN_PHPDOC_TAG = 17;
	public const TOKEN_DOCTRINE_TAG = 18;
	public const TOKEN_FLOAT = 19;
	public const TOKEN_INTEGER = 20;
	public const TOKEN_SINGLE_QUOTED_STRING = 21;
	public const TOKEN_DOUBLE_QUOTED_STRING = 22;
	public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
	public const TOKEN_IDENTIFIER = 24;
	public const TOKEN_THIS_VARIABLE = 25;
	public const TOKEN_VARIABLE = 26;
	public const TOKEN_HORIZONTAL_WS = 27;
	public const TOKEN_PHPDOC_EOL = 28;
	public const TOKEN_OTHER = 29;
	public const TOKEN_END = 30;
	public const TOKEN_COLON = 31;
	public const TOKEN_WILDCARD = 32;
	public const TOKEN_OPEN_CURLY_BRACKET = 33;
	public const TOKEN_CLOSE_CURLY_BRACKET = 34;
	public const TOKEN_NEGATED = 35;
	public const TOKEN_ARROW = 36;

	/** Display label for every token type, keyed by the TOKEN_* constant. */
	public const TOKEN_LABELS = [
		self::TOKEN_REFERENCE => '\'&\'',
		self::TOKEN_UNION => '\'|\'',
		self::TOKEN_INTERSECTION => '\'&\'',
		self::TOKEN_NULLABLE => '\'?\'',
		self::TOKEN_NEGATED => '\'!\'',
		self::TOKEN_OPEN_PARENTHESES => '\'(\'',
		self::TOKEN_CLOSE_PARENTHESES => '\')\'',
		self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
		self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
		self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
		self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
		self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
		self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
		self::TOKEN_COMMA => '\',\'',
		self::TOKEN_COLON => '\':\'',
		self::TOKEN_VARIADIC => '\'...\'',
		self::TOKEN_DOUBLE_COLON => '\'::\'',
		self::TOKEN_DOUBLE_ARROW => '\'=>\'',
		self::TOKEN_ARROW => '\'->\'',
		self::TOKEN_EQUAL => '\'=\'',
		self::TOKEN_OPEN_PHPDOC => '\'/**\'',
		self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
		self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
		self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
		self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
		self::TOKEN_FLOAT => 'TOKEN_FLOAT',
		self::TOKEN_INTEGER => 'TOKEN_INTEGER',
		self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
		self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
		self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
		self::TOKEN_IDENTIFIER => 'type',
		self::TOKEN_THIS_VARIABLE => '\'$this\'',
		self::TOKEN_VARIABLE => 'variable',
		self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
		self::TOKEN_OTHER => 'TOKEN_OTHER',
		self::TOKEN_END => 'TOKEN_END',
		self::TOKEN_WILDCARD => '*',
	];

	/** Index of the matched text inside a token array. */
	public const VALUE_OFFSET = 0;
	/** Index of the TOKEN_* type inside a token array. */
	public const TYPE_OFFSET = 1;
	/** Index of the 1-based line number inside a token array. */
	public const LINE_OFFSET = 2;

	/** @var bool when true, the Doctrine tag and annotation-string patterns are added to the regexp */
	private $parseDoctrineAnnotations;

	/** @var string|null combined regexp; stays null until the first tokenize() call builds it */
	private $regexp;

	/**
	 * @param bool $parseDoctrineAnnotations whether TOKEN_DOCTRINE_TAG and
	 *                                       TOKEN_DOCTRINE_ANNOTATION_STRING should be recognised
	 */
	public function __construct(bool $parseDoctrineAnnotations = false)
	{
		$this->parseDoctrineAnnotations = $parseDoctrineAnnotations;
	}

	/**
	 * Tokenizes the given string.
	 *
	 * The result always ends with an empty TOKEN_END token carrying the last line number.
	 *
	 * @param string $s text to tokenize
	 * @return list<array{string, int, int}> tokens as [value, type, line]
	 */
	public function tokenize(string $s): array
	{
		// The pattern is built once per instance and then reused.
		$this->regexp = $this->regexp ?? $this->generateRegexp();

		preg_match_all($this->regexp, $s, $found, PREG_SET_ORDER);

		$result = [];
		$currentLine = 1;
		foreach ($found as $hit) {
			// Each alternative carries a (*MARK:n) verb, so 'MARK' holds the token type.
			$kind = (int) $hit['MARK'];
			$result[] = [
				self::VALUE_OFFSET => $hit[0],
				self::TYPE_OFFSET => $kind,
				self::LINE_OFFSET => $currentLine,
			];

			// The EOL token itself is reported on the line it terminates.
			if ($kind === self::TOKEN_PHPDOC_EOL) {
				$currentLine++;
			}
		}

		$result[] = [
			self::VALUE_OFFSET => '',
			self::TYPE_OFFSET => self::TOKEN_END,
			self::LINE_OFFSET => $currentLine,
		];

		return $result;
	}

	/**
	 * Builds the combined regular expression: one alternative per token type,
	 * each followed by a (*MARK:<type>) verb naming the type that matched.
	 *
	 * Alternatives are tried in array order, so the order of the entries below
	 * decides which token type wins when several patterns could match.
	 */
	private function generateRegexp(): string
	{
		$patternsByType = [
			self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

			self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
			self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
			self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

			// '&' counts as a reference only when, after optional whitespace, it is followed
			// by '.', ',', '=', ')' or by a '$' that does not start '$this'
			self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
			self::TOKEN_UNION => '\\|',
			self::TOKEN_INTERSECTION => '&',
			self::TOKEN_NULLABLE => '\\?',
			self::TOKEN_NEGATED => '!',

			self::TOKEN_OPEN_PARENTHESES => '\\(',
			self::TOKEN_CLOSE_PARENTHESES => '\\)',
			self::TOKEN_OPEN_ANGLE_BRACKET => '<',
			self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
			self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
			self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
			self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
			self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

			self::TOKEN_COMMA => ',',
			self::TOKEN_VARIADIC => '\\.\\.\\.',
			self::TOKEN_DOUBLE_COLON => '::',
			self::TOKEN_DOUBLE_ARROW => '=>',
			self::TOKEN_ARROW => '->',
			self::TOKEN_EQUAL => '=',
			self::TOKEN_COLON => ':',

			self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
			self::TOKEN_CLOSE_PHPDOC => '\\*/',
			self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
			// a line break plus the following indentation and optional leading '*' (not '*/')
			self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

			self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
			self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
			self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
			self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',

			self::TOKEN_WILDCARD => '\\*',
		];

		if ($this->parseDoctrineAnnotations) {
			// Appended after the patterns above, so these are tried only when those fail.
			$patternsByType[self::TOKEN_DOCTRINE_TAG] = '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*';
			$patternsByType[self::TOKEN_DOCTRINE_ANNOTATION_STRING] = '"(?:""|[^"])*+"';
		}

		// Last resort: a run of non-whitespace characters that stops before any '*/'
		$patternsByType[self::TOKEN_OTHER] = '(?:(?!\\*/)[^\\s])++';

		$alternatives = [];
		foreach ($patternsByType as $tokenType => $tokenPattern) {
			$alternatives[] = '(?:' . $tokenPattern . ')(*MARK:' . $tokenType . ')';
		}

		// Modifiers: A = anchored at the current offset, s = dot matches newline, i = case-insensitive
		return '~' . implode('|', $alternatives) . '~Asi';
	}

}
199: | |