1use core::cell::{Cell, RefCell};
2
3use alloc::{
4 boxed::Box,
5 string::{String, ToString},
6 vec,
7 vec::Vec,
8};
9
10use crate::{
11 error::Error,
12 hir::{self, Config, Flags, Hir, HirKind},
13};
14
15const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
28const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
29const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
30const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
31const ERR_UNCLOSED_GROUP_QUESTION: &str =
32 "expected closing ')', but got end of pattern";
33const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
34const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
35const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
36const ERR_MISSING_GROUP_NAME: &str =
37 "expected capture group name, but got end of pattern";
38const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
39const ERR_UNCLOSED_GROUP_NAME: &str =
40 "expected end of capture group name, but got end of pattern";
41const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
42const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
43const ERR_FLAG_REPEATED_NEGATION: &str =
44 "inline flag negation cannot be repeated";
45const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
46const ERR_FLAG_UNEXPECTED_EOF: &str =
47 "expected ':' or ')' to end inline flags, but got end of pattern";
48const ERR_FLAG_DANGLING_NEGATION: &str =
49 "inline flags cannot end with negation directive";
50const ERR_DECIMAL_NO_DIGITS: &str =
51 "expected decimal number, but found no digits";
52const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
53const ERR_HEX_BRACE_INVALID_DIGIT: &str =
54 "expected hexadecimal number in braces, but got non-hex digit";
55const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
56 "expected hexadecimal number, but saw end of pattern before closing brace";
57const ERR_HEX_BRACE_EMPTY: &str =
58 "expected hexadecimal number in braces, but got no digits";
59const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
60const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
61 "expected fixed length hexadecimal number, but saw end of pattern first";
62const ERR_HEX_FIXED_INVALID_DIGIT: &str =
63 "expected fixed length hexadecimal number, but got non-hex digit";
64const ERR_HEX_FIXED_INVALID: &str =
65 "got invalid fixed length hexadecimal number";
66const ERR_HEX_UNEXPECTED_EOF: &str =
67 "expected hexadecimal number, but saw end of pattern first";
68const ERR_ESCAPE_UNEXPECTED_EOF: &str =
69 "saw start of escape sequence, but saw end of pattern before it finished";
70const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
71const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
72 "Unicode character classes are not supported";
73const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
74const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
75 "unrecognized POSIX character class";
76const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
77 "uncounted repetition operator must be applied to a sub-expression";
78const ERR_COUNTED_REP_SUB_MISSING: &str =
79 "counted repetition operator must be applied to a sub-expression";
80const ERR_COUNTED_REP_UNCLOSED: &str =
81 "found unclosed counted repetition operator";
82const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
83 "found incomplete and unclosed counted repetition operator";
84const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
85 "found counted repetition operator with a comma that is unclosed";
86const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
87 "found counted repetition with min and max that is unclosed";
88const ERR_COUNTED_REP_INVALID: &str =
89 "expected closing brace for counted repetition, but got something else";
90const ERR_COUNTED_REP_INVALID_RANGE: &str =
91 "found counted repetition with a min bigger than its max";
92const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
93 "non-empty character class has no closing bracket";
94const ERR_CLASS_INVALID_RANGE_ITEM: &str =
95 "character class ranges must start and end with a single character";
96const ERR_CLASS_INVALID_ITEM: &str =
97 "invalid escape sequence in character class";
98const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
99 "non-empty character class has no closing bracket after dash";
100const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
101 "negated character class has no closing bracket";
102const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
103 "character class begins with literal ']' but has no closing bracket";
104const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
105const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
106const ERR_CLASS_NEST_UNSUPPORTED: &str =
107 "nested character classes are not supported";
108const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
109 "character class intersection is not supported";
110const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111 "character class difference is not supported";
112const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113 "character class symmetric difference is not supported";
114const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115 "special word boundary assertion is unclosed or has an invalid character";
116const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117 "special word boundary assertion is unrecognized";
118const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119 "found start of special word boundary or repetition without an end";
120
121#[derive(Clone, Debug)]
129pub(super) struct Parser<'a> {
130 config: Config,
132 pattern: &'a str,
134 depth: Cell<u32>,
138 pos: Cell<usize>,
140 char: Cell<Option<char>>,
145 capture_index: Cell<u32>,
147 flags: RefCell<Flags>,
149 capture_names: RefCell<Vec<String>>,
152}
153
154impl<'a> Parser<'a> {
156 pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
158 Parser {
159 config,
160 pattern,
161 depth: Cell::new(0),
162 pos: Cell::new(0),
163 char: Cell::new(pattern.chars().next()),
164 capture_index: Cell::new(0),
165 flags: RefCell::new(config.flags),
166 capture_names: RefCell::new(vec![]),
167 }
168 }
169
170 fn pattern(&self) -> &str {
172 self.pattern
173 }
174
175 fn pos(&self) -> usize {
180 self.pos.get()
181 }
182
183 fn increment_depth(&self) -> Result<u32, Error> {
190 let old = self.depth.get();
191 if old > self.config.nest_limit {
192 return Err(Error::new(ERR_TOO_MUCH_NESTING));
193 }
194 let new = old.checked_add(1).unwrap();
197 self.depth.set(new);
198 Ok(old)
199 }
200
201 fn decrement_depth(&self) {
205 let old = self.depth.get();
206 let new = old.checked_sub(1).unwrap();
209 self.depth.set(new);
210 }
211
212 fn char(&self) -> char {
216 self.char.get().expect("codepoint, but parser is done")
217 }
218
219 fn is_done(&self) -> bool {
221 self.pos() == self.pattern.len()
222 }
223
224 fn flags(&self) -> Flags {
226 *self.flags.borrow()
227 }
228
229 fn bump(&self) -> bool {
233 if self.is_done() {
234 return false;
235 }
236 self.pos.set(self.pos() + self.char().len_utf8());
237 self.char.set(self.pattern()[self.pos()..].chars().next());
238 self.char.get().is_some()
239 }
240
241 fn bump_if(&self, prefix: &str) -> bool {
246 if self.pattern()[self.pos()..].starts_with(prefix) {
247 for _ in 0..prefix.chars().count() {
248 self.bump();
249 }
250 true
251 } else {
252 false
253 }
254 }
255
256 fn bump_and_bump_space(&self) -> bool {
259 if !self.bump() {
260 return false;
261 }
262 self.bump_space();
263 !self.is_done()
264 }
265
266 fn bump_space(&self) {
276 if !self.flags().ignore_whitespace {
277 return;
278 }
279 while !self.is_done() {
280 if self.char().is_whitespace() {
281 self.bump();
282 } else if self.char() == '#' {
283 self.bump();
284 while !self.is_done() {
285 let c = self.char();
286 self.bump();
287 if c == '\n' {
288 break;
289 }
290 }
291 } else {
292 break;
293 }
294 }
295 }
296
297 fn peek(&self) -> Option<char> {
301 if self.is_done() {
302 return None;
303 }
304 self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
305 }
306
307 fn peek_space(&self) -> Option<char> {
310 if !self.flags().ignore_whitespace {
311 return self.peek();
312 }
313 if self.is_done() {
314 return None;
315 }
316 let mut start = self.pos() + self.char().len_utf8();
317 let mut in_comment = false;
318 for (i, ch) in self.pattern()[start..].char_indices() {
319 if ch.is_whitespace() {
320 continue;
321 } else if !in_comment && ch == '#' {
322 in_comment = true;
323 } else if in_comment && ch == '\n' {
324 in_comment = false;
325 } else {
326 start += i;
327 break;
328 }
329 }
330 self.pattern()[start..].chars().next()
331 }
332
333 fn next_capture_index(&self) -> Result<u32, Error> {
340 let current = self.capture_index.get();
341 let next = current
342 .checked_add(1)
343 .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
344 self.capture_index.set(next);
345 Ok(next)
346 }
347
348 fn add_capture_name(&self, name: &str) -> Result<(), Error> {
351 let mut names = self.capture_names.borrow_mut();
352 match names.binary_search_by(|n| name.cmp(n)) {
353 Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
354 Err(i) => {
355 names.insert(i, name.to_string());
356 Ok(())
357 }
358 }
359 }
360
361 fn is_lookaround_prefix(&self) -> bool {
369 self.bump_if("?=")
370 || self.bump_if("?!")
371 || self.bump_if("?<=")
372 || self.bump_if("?<!")
373 }
374}
375
376impl<'a> Parser<'a> {
379 pub(super) fn parse(&self) -> Result<Hir, Error> {
380 let hir = self.parse_inner()?;
381 check_hir_nesting(&hir, self.config.nest_limit)?;
394 Ok(hir)
395 }
396
397 fn parse_inner(&self) -> Result<Hir, Error> {
398 let depth = self.increment_depth()?;
399 let mut alternates = vec![];
400 let mut concat = vec![];
401 loop {
402 self.bump_space();
403 if self.is_done() {
404 break;
405 }
406 match self.char() {
407 '(' => {
408 let oldflags = *self.flags.borrow();
411 if let Some(sub) = self.parse_group()? {
412 concat.push(sub);
413 *self.flags.borrow_mut() = oldflags;
419 }
420 if self.char.get() != Some(')') {
421 return Err(Error::new(ERR_UNCLOSED_GROUP));
422 }
423 self.bump();
424 }
425 ')' => {
426 if depth == 0 {
427 return Err(Error::new(ERR_UNOPENED_GROUP));
428 }
429 break;
430 }
431 '|' => {
432 alternates.push(Hir::concat(core::mem::take(&mut concat)));
433 self.bump();
434 }
435 '[' => concat.push(self.parse_class()?),
436 '?' | '*' | '+' => {
437 concat = self.parse_uncounted_repetition(concat)?;
438 }
439 '{' => {
440 concat = self.parse_counted_repetition(concat)?;
441 }
442 _ => concat.push(self.parse_primitive()?),
443 }
444 }
445 self.decrement_depth();
446 alternates.push(Hir::concat(concat));
447 Ok(Hir::alternation(alternates))
449 }
450
451 fn parse_primitive(&self) -> Result<Hir, Error> {
456 let ch = self.char();
457 self.bump();
458 match ch {
459 '\\' => self.parse_escape(),
460 '.' => Ok(self.hir_dot()),
461 '^' => Ok(self.hir_anchor_start()),
462 '$' => Ok(self.hir_anchor_end()),
463 ch => Ok(self.hir_char(ch)),
464 }
465 }
466
467 fn parse_escape(&self) -> Result<Hir, Error> {
474 if self.is_done() {
475 return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
476 }
477 let ch = self.char();
478 match ch {
480 '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
481 'p' | 'P' => {
482 return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
483 }
484 'x' | 'u' | 'U' => return self.parse_hex(),
485 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
486 return Ok(self.parse_perl_class());
487 }
488 _ => {}
489 }
490
491 self.bump();
493 if hir::is_meta_character(ch) || hir::is_escapable_character(ch) {
494 return Ok(self.hir_char(ch));
495 }
496 let special = |ch| Ok(self.hir_char(ch));
497 match ch {
498 'a' => special('\x07'),
499 'f' => special('\x0C'),
500 't' => special('\t'),
501 'n' => special('\n'),
502 'r' => special('\r'),
503 'v' => special('\x0B'),
504 'A' => Ok(Hir::look(hir::Look::Start)),
505 'z' => Ok(Hir::look(hir::Look::End)),
506 'b' => {
507 let mut hir = Hir::look(hir::Look::Word);
508 if !self.is_done() && self.char() == '{' {
509 if let Some(special) =
510 self.maybe_parse_special_word_boundary()?
511 {
512 hir = special;
513 }
514 }
515 Ok(hir)
516 }
517 'B' => Ok(Hir::look(hir::Look::WordNegate)),
518 '<' => Ok(Hir::look(hir::Look::WordStart)),
519 '>' => Ok(Hir::look(hir::Look::WordEnd)),
520 _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
521 }
522 }
523
524 fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
544 assert_eq!(self.char(), '{');
545
546 let is_valid_char = |c| match c {
547 'A'..='Z' | 'a'..='z' | '-' => true,
548 _ => false,
549 };
550 let start = self.pos();
551 if !self.bump_and_bump_space() {
552 return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
553 }
554 if !is_valid_char(self.char()) {
559 self.pos.set(start);
560 self.char.set(Some('{'));
561 return Ok(None);
562 }
563
564 let mut scratch = String::new();
566 while !self.is_done() && is_valid_char(self.char()) {
567 scratch.push(self.char());
568 self.bump_and_bump_space();
569 }
570 if self.is_done() || self.char() != '}' {
571 return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
572 }
573 self.bump();
574 let kind = match scratch.as_str() {
575 "start" => hir::Look::WordStart,
576 "end" => hir::Look::WordEnd,
577 "start-half" => hir::Look::WordStartHalf,
578 "end-half" => hir::Look::WordEndHalf,
579 _ => {
580 return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
581 }
582 };
583 Ok(Some(Hir::look(kind)))
584 }
585
586 fn parse_hex(&self) -> Result<Hir, Error> {
591 let digit_len = match self.char() {
592 'x' => 2,
593 'u' => 4,
594 'U' => 8,
595 unk => unreachable!(
596 "invalid start of fixed length hexadecimal number {unk}"
597 ),
598 };
599 if !self.bump_and_bump_space() {
600 return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
601 }
602 if self.char() == '{' {
603 self.parse_hex_brace()
604 } else {
605 self.parse_hex_digits(digit_len)
606 }
607 }
608
609 fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
617 let mut scratch = String::new();
618 for i in 0..digit_len {
619 if i > 0 && !self.bump_and_bump_space() {
620 return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
621 }
622 if !is_hex(self.char()) {
623 return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
624 }
625 scratch.push(self.char());
626 }
627 self.bump_and_bump_space();
630 match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
631 None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
632 Some(ch) => Ok(self.hir_char(ch)),
633 }
634 }
635
636 fn parse_hex_brace(&self) -> Result<Hir, Error> {
640 let mut scratch = String::new();
641 while self.bump_and_bump_space() && self.char() != '}' {
642 if !is_hex(self.char()) {
643 return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
644 }
645 scratch.push(self.char());
646 }
647 if self.is_done() {
648 return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
649 }
650 assert_eq!(self.char(), '}');
651 self.bump_and_bump_space();
652
653 if scratch.is_empty() {
654 return Err(Error::new(ERR_HEX_BRACE_EMPTY));
655 }
656 match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
657 None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
658 Some(ch) => Ok(self.hir_char(ch)),
659 }
660 }
661
662 fn parse_decimal(&self) -> Result<u32, Error> {
672 let mut scratch = String::new();
673 while !self.is_done() && self.char().is_whitespace() {
674 self.bump();
675 }
676 while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
677 scratch.push(self.char());
678 self.bump_and_bump_space();
679 }
680 while !self.is_done() && self.char().is_whitespace() {
681 self.bump_and_bump_space();
682 }
683 let digits = scratch.as_str();
684 if digits.is_empty() {
685 return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
686 }
687 match u32::from_str_radix(digits, 10).ok() {
688 Some(n) => Ok(n),
689 None => Err(Error::new(ERR_DECIMAL_INVALID)),
690 }
691 }
692
693 fn parse_uncounted_repetition(
709 &self,
710 mut concat: Vec<Hir>,
711 ) -> Result<Vec<Hir>, Error> {
712 let sub = match concat.pop() {
713 Some(hir) => Box::new(hir),
714 None => {
715 return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
716 }
717 };
718 let (min, max) = match self.char() {
719 '?' => (0, Some(1)),
720 '*' => (0, None),
721 '+' => (1, None),
722 unk => unreachable!("unrecognized repetition operator '{unk}'"),
723 };
724 let mut greedy = true;
725 if self.bump() && self.char() == '?' {
726 greedy = false;
727 self.bump();
728 }
729 if self.flags().swap_greed {
730 greedy = !greedy;
731 }
732 concat.push(Hir::repetition(hir::Repetition {
733 min,
734 max,
735 greedy,
736 sub,
737 }));
738 Ok(concat)
739 }
740
741 fn parse_counted_repetition(
756 &self,
757 mut concat: Vec<Hir>,
758 ) -> Result<Vec<Hir>, Error> {
759 assert_eq!(self.char(), '{', "expected opening brace");
760 let sub = match concat.pop() {
761 Some(hir) => Box::new(hir),
762 None => {
763 return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
764 }
765 };
766 if !self.bump_and_bump_space() {
767 return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
768 }
769 let min = self.parse_decimal()?;
770 let mut max = Some(min);
771 if self.is_done() {
772 return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
773 }
774 if self.char() == ',' {
775 if !self.bump_and_bump_space() {
776 return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
777 }
778 if self.char() != '}' {
779 max = Some(self.parse_decimal()?);
780 } else {
781 max = None;
782 }
783 if self.is_done() {
784 return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
785 }
786 }
787 if self.char() != '}' {
788 return Err(Error::new(ERR_COUNTED_REP_INVALID));
789 }
790
791 let mut greedy = true;
792 if self.bump_and_bump_space() && self.char() == '?' {
793 greedy = false;
794 self.bump();
795 }
796 if self.flags().swap_greed {
797 greedy = !greedy;
798 }
799
800 if max.map_or(false, |max| min > max) {
801 return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
802 }
803 concat.push(Hir::repetition(hir::Repetition {
804 min,
805 max,
806 greedy,
807 sub,
808 }));
809 Ok(concat)
810 }
811
812 fn parse_group(&self) -> Result<Option<Hir>, Error> {
818 assert_eq!(self.char(), '(');
819 self.bump_and_bump_space();
820 if self.is_lookaround_prefix() {
821 return Err(Error::new(ERR_LOOK_UNSUPPORTED));
822 }
823 if self.bump_if("?P<") || self.bump_if("?<") {
824 let index = self.next_capture_index()?;
825 let name = Some(Box::from(self.parse_capture_name()?));
826 let sub = Box::new(self.parse_inner()?);
827 let cap = hir::Capture { index, name, sub };
828 Ok(Some(Hir::capture(cap)))
829 } else if self.bump_if("?") {
830 if self.is_done() {
831 return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
832 }
833 let start = self.pos();
834 *self.flags.borrow_mut() = self.parse_flags()?;
836 let consumed = self.pos() - start;
837 if self.char() == ')' {
838 if consumed == 0 {
840 return Err(Error::new(ERR_EMPTY_FLAGS));
841 }
842 Ok(None)
843 } else {
844 assert_eq!(':', self.char());
845 self.bump();
846 self.parse_inner().map(Some)
847 }
848 } else {
849 let index = self.next_capture_index()?;
850 let sub = Box::new(self.parse_inner()?);
851 let cap = hir::Capture { index, name: None, sub };
852 Ok(Some(Hir::capture(cap)))
853 }
854 }
855
856 fn parse_capture_name(&self) -> Result<&str, Error> {
861 if self.is_done() {
862 return Err(Error::new(ERR_MISSING_GROUP_NAME));
863 }
864 let start = self.pos();
865 loop {
866 if self.char() == '>' {
867 break;
868 }
869 if !is_capture_char(self.char(), self.pos() == start) {
870 return Err(Error::new(ERR_INVALID_GROUP_NAME));
871 }
872 if !self.bump() {
873 break;
874 }
875 }
876 let end = self.pos();
877 if self.is_done() {
878 return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
879 }
880 assert_eq!(self.char(), '>');
881 self.bump();
882 let name = &self.pattern()[start..end];
883 if name.is_empty() {
884 return Err(Error::new(ERR_EMPTY_GROUP_NAME));
885 }
886 self.add_capture_name(name)?;
887 Ok(name)
888 }
889
890 fn parse_flags(&self) -> Result<Flags, Error> {
905 let mut flags = *self.flags.borrow();
906 let mut negate = false;
907 let mut last_was_negation = false;
910 let mut seen = [false; 128];
913 while self.char() != ':' && self.char() != ')' {
914 if self.char() == '-' {
915 last_was_negation = true;
916 if negate {
917 return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
918 }
919 negate = true;
920 } else {
921 last_was_negation = false;
922 self.parse_flag(&mut flags, negate)?;
923 let flag_byte = u8::try_from(self.char()).unwrap();
926 if seen[usize::from(flag_byte)] {
927 return Err(Error::new(ERR_FLAG_DUPLICATE));
928 }
929 seen[usize::from(flag_byte)] = true;
930 }
931 if !self.bump() {
932 return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
933 }
934 }
935 if last_was_negation {
936 return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
937 }
938 Ok(flags)
939 }
940
941 fn parse_flag(
950 &self,
951 flags: &mut Flags,
952 negate: bool,
953 ) -> Result<(), Error> {
954 let enabled = !negate;
955 match self.char() {
956 'i' => flags.case_insensitive = enabled,
957 'm' => flags.multi_line = enabled,
958 's' => flags.dot_matches_new_line = enabled,
959 'U' => flags.swap_greed = enabled,
960 'R' => flags.crlf = enabled,
961 'x' => flags.ignore_whitespace = enabled,
962 'u' => {}
968 _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
969 }
970 Ok(())
971 }
972
973 fn parse_class(&self) -> Result<Hir, Error> {
980 assert_eq!(self.char(), '[');
981
982 let mut union = vec![];
983 if !self.bump_and_bump_space() {
984 return Err(Error::new(ERR_CLASS_UNCLOSED));
985 }
986 let negate = if self.char() != '^' {
988 false
989 } else {
990 if !self.bump_and_bump_space() {
991 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
992 }
993 true
994 };
995 while self.char() == '-' {
997 union.push(hir::ClassRange { start: '-', end: '-' });
998 if !self.bump_and_bump_space() {
999 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1000 }
1001 }
1002 if union.is_empty() && self.char() == ']' {
1005 union.push(hir::ClassRange { start: ']', end: ']' });
1006 if !self.bump_and_bump_space() {
1007 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
1008 }
1009 }
1010 loop {
1011 self.bump_space();
1012 if self.is_done() {
1013 return Err(Error::new(ERR_CLASS_UNCLOSED));
1014 }
1015 match self.char() {
1016 '[' => {
1017 if let Some(class) = self.maybe_parse_posix_class() {
1021 union.extend_from_slice(&class.ranges);
1022 continue;
1023 }
1024 return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
1026 }
1027 ']' => {
1028 self.bump();
1029 let mut class = hir::Class::new(union);
1030 if self.flags().case_insensitive {
1035 class.ascii_case_fold();
1036 }
1037 if negate {
1038 class.negate();
1039 }
1040 return Ok(Hir::class(class));
1041 }
1042 '&' if self.peek() == Some('&') => {
1043 return Err(Error::new(
1044 ERR_CLASS_INTERSECTION_UNSUPPORTED,
1045 ));
1046 }
1047 '-' if self.peek() == Some('-') => {
1048 return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
1049 }
1050 '~' if self.peek() == Some('~') => {
1051 return Err(Error::new(
1052 ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
1053 ));
1054 }
1055 _ => self.parse_class_range(&mut union)?,
1056 }
1057 }
1058 }
1059
1060 fn parse_class_range(
1072 &self,
1073 union: &mut Vec<hir::ClassRange>,
1074 ) -> Result<(), Error> {
1075 let prim1 = self.parse_class_item()?;
1076 self.bump_space();
1077 if self.is_done() {
1078 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
1079 }
1080 if self.char() != '-'
1088 || self.peek_space() == Some(']')
1089 || self.peek_space() == Some('-')
1090 {
1091 union.extend_from_slice(&into_class_item_ranges(prim1)?);
1092 return Ok(());
1093 }
1094 if !self.bump_and_bump_space() {
1097 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1098 }
1099 let prim2 = self.parse_class_item()?;
1100 let range = hir::ClassRange {
1101 start: into_class_item_range(prim1)?,
1102 end: into_class_item_range(prim2)?,
1103 };
1104 if range.start > range.end {
1105 return Err(Error::new(ERR_CLASS_INVALID_RANGE));
1106 }
1107 union.push(range);
1108 Ok(())
1109 }
1110
1111 fn parse_class_item(&self) -> Result<Hir, Error> {
1122 let ch = self.char();
1123 self.bump();
1124 if ch == '\\' {
1125 self.parse_escape()
1126 } else {
1127 Ok(Hir::char(ch))
1128 }
1129 }
1130
1131 fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
1140 assert_eq!(self.char(), '[');
1161
1162 let start_pos = self.pos();
1164 let start_char = self.char.get();
1165 let reset = || {
1166 self.pos.set(start_pos);
1167 self.char.set(start_char);
1168 };
1169
1170 let mut negated = false;
1171 if !self.bump() || self.char() != ':' {
1172 reset();
1173 return None;
1174 }
1175 if !self.bump() {
1176 reset();
1177 return None;
1178 }
1179 if self.char() == '^' {
1180 negated = true;
1181 if !self.bump() {
1182 reset();
1183 return None;
1184 }
1185 }
1186 let name_start = self.pos();
1187 while self.char() != ':' && self.bump() {}
1188 if self.is_done() {
1189 reset();
1190 return None;
1191 }
1192 let name = &self.pattern()[name_start..self.pos()];
1193 if !self.bump_if(":]") {
1194 reset();
1195 return None;
1196 }
1197 if let Ok(ranges) = posix_class(name) {
1198 let mut class = hir::Class::new(ranges);
1199 if negated {
1200 class.negate();
1201 }
1202 return Some(class);
1203 }
1204 reset();
1205 None
1206 }
1207
1208 fn parse_perl_class(&self) -> Hir {
1212 let ch = self.char();
1213 self.bump();
1214 let mut class = hir::Class::new(match ch {
1215 'd' | 'D' => posix_class("digit").unwrap(),
1216 's' | 'S' => posix_class("space").unwrap(),
1217 'w' | 'W' => posix_class("word").unwrap(),
1218 unk => unreachable!("invalid Perl class \\{unk}"),
1219 });
1220 if ch.is_ascii_uppercase() {
1221 class.negate();
1222 }
1223 Hir::class(class)
1224 }
1225
1226 fn hir_dot(&self) -> Hir {
1227 if self.flags().dot_matches_new_line {
1228 Hir::class(hir::Class::new([hir::ClassRange {
1229 start: '\x00',
1230 end: '\u{10FFFF}',
1231 }]))
1232 } else if self.flags().crlf {
1233 Hir::class(hir::Class::new([
1234 hir::ClassRange { start: '\x00', end: '\x09' },
1235 hir::ClassRange { start: '\x0B', end: '\x0C' },
1236 hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' },
1237 ]))
1238 } else {
1239 Hir::class(hir::Class::new([
1240 hir::ClassRange { start: '\x00', end: '\x09' },
1241 hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' },
1242 ]))
1243 }
1244 }
1245
1246 fn hir_anchor_start(&self) -> Hir {
1247 let look = if self.flags().multi_line {
1248 if self.flags().crlf {
1249 hir::Look::StartCRLF
1250 } else {
1251 hir::Look::StartLF
1252 }
1253 } else {
1254 hir::Look::Start
1255 };
1256 Hir::look(look)
1257 }
1258
1259 fn hir_anchor_end(&self) -> Hir {
1260 let look = if self.flags().multi_line {
1261 if self.flags().crlf {
1262 hir::Look::EndCRLF
1263 } else {
1264 hir::Look::EndLF
1265 }
1266 } else {
1267 hir::Look::End
1268 };
1269 Hir::look(look)
1270 }
1271
1272 fn hir_char(&self, ch: char) -> Hir {
1273 if self.flags().case_insensitive {
1274 let this = hir::ClassRange { start: ch, end: ch };
1275 if let Some(folded) = this.ascii_case_fold() {
1276 return Hir::class(hir::Class::new([this, folded]));
1277 }
1278 }
1279 Hir::char(ch)
1280 }
1281}
1282
1283fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
1286 fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
1287 if depth > limit {
1288 return Err(Error::new(ERR_TOO_MUCH_NESTING));
1289 }
1290 let Some(next_depth) = depth.checked_add(1) else {
1291 return Err(Error::new(ERR_TOO_MUCH_NESTING));
1292 };
1293 match *hir.kind() {
1294 HirKind::Empty
1295 | HirKind::Char(_)
1296 | HirKind::Class(_)
1297 | HirKind::Look(_) => Ok(()),
1298 HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
1299 recurse(sub, limit, next_depth)
1300 }
1301 HirKind::Capture(hir::Capture { ref sub, .. }) => {
1302 recurse(sub, limit, next_depth)
1303 }
1304 HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
1305 for sub in subs.iter() {
1306 recurse(sub, limit, next_depth)?;
1307 }
1308 Ok(())
1309 }
1310 }
1311 }
1312 recurse(hir, limit, 0)
1313}
1314
1315fn into_class_item_range(hir: Hir) -> Result<char, Error> {
1324 match hir.kind {
1325 HirKind::Char(ch) => Ok(ch),
1326 _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
1327 }
1328}
1329
1330fn into_class_item_ranges(
1331 mut hir: Hir,
1332) -> Result<Vec<hir::ClassRange>, Error> {
1333 match core::mem::replace(&mut hir.kind, HirKind::Empty) {
1334 HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
1335 HirKind::Class(hir::Class { ranges }) => Ok(ranges),
1336 _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
1337 }
1338}
1339
1340fn posix_class(
1344 kind: &str,
1345) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
1346 let slice: &'static [(u8, u8)] = match kind {
1347 "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1348 "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
1349 "ascii" => &[(b'\x00', b'\x7F')],
1350 "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
1351 "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1352 "digit" => &[(b'0', b'9')],
1353 "graph" => &[(b'!', b'~')],
1354 "lower" => &[(b'a', b'z')],
1355 "print" => &[(b' ', b'~')],
1356 "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1357 "space" => &[
1358 (b'\t', b'\t'),
1359 (b'\n', b'\n'),
1360 (b'\x0B', b'\x0B'),
1361 (b'\x0C', b'\x0C'),
1362 (b'\r', b'\r'),
1363 (b' ', b' '),
1364 ],
1365 "upper" => &[(b'A', b'Z')],
1366 "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1367 "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1368 _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
1369 };
1370 Ok(slice.iter().map(|&(start, end)| hir::ClassRange {
1371 start: char::from(start),
1372 end: char::from(end),
1373 }))
1374}
1375
1376fn is_hex(c: char) -> bool {
1378 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
1379}
1380
1381fn is_capture_char(c: char, first: bool) -> bool {
1386 if first {
1387 c == '_' || c.is_alphabetic()
1388 } else {
1389 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
1390 }
1391}
1392
1393#[cfg(test)]
1394mod tests {
1395 use super::*;
1396
1397 fn p(pattern: &str) -> Hir {
1398 Parser::new(Config::default(), pattern).parse_inner().unwrap()
1399 }
1400
1401 fn perr(pattern: &str) -> String {
1402 Parser::new(Config::default(), pattern)
1403 .parse_inner()
1404 .unwrap_err()
1405 .to_string()
1406 }
1407
1408 fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
1409 Hir::class(hir::Class::new(
1410 it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
1411 ))
1412 }
1413
1414 fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
1415 Hir::class(hir::Class::new(
1416 it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
1417 ))
1418 }
1419
1420 fn posix(name: &str) -> Hir {
1421 Hir::class(hir::Class::new(posix_class(name).unwrap()))
1422 }
1423
1424 fn cap(index: u32, sub: Hir) -> Hir {
1425 Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
1426 }
1427
1428 fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
1429 Hir::capture(hir::Capture {
1430 index,
1431 name: Some(Box::from(name)),
1432 sub: Box::new(sub),
1433 })
1434 }
1435
1436 #[test]
1437 fn ok_literal() {
1438 assert_eq!(p("a"), Hir::char('a'));
1439 assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
1440 assert_eq!(p("💩"), Hir::char('💩'));
1441 }
1442
1443 #[test]
1444 fn ok_meta_escapes() {
1445 assert_eq!(p(r"\*"), Hir::char('*'));
1446 assert_eq!(p(r"\+"), Hir::char('+'));
1447 assert_eq!(p(r"\?"), Hir::char('?'));
1448 assert_eq!(p(r"\|"), Hir::char('|'));
1449 assert_eq!(p(r"\("), Hir::char('('));
1450 assert_eq!(p(r"\)"), Hir::char(')'));
1451 assert_eq!(p(r"\^"), Hir::char('^'));
1452 assert_eq!(p(r"\$"), Hir::char('$'));
1453 assert_eq!(p(r"\["), Hir::char('['));
1454 assert_eq!(p(r"\]"), Hir::char(']'));
1455 }
1456
1457 #[test]
1458 fn ok_special_escapes() {
1459 assert_eq!(p(r"\a"), Hir::char('\x07'));
1460 assert_eq!(p(r"\f"), Hir::char('\x0C'));
1461 assert_eq!(p(r"\t"), Hir::char('\t'));
1462 assert_eq!(p(r"\n"), Hir::char('\n'));
1463 assert_eq!(p(r"\r"), Hir::char('\r'));
1464 assert_eq!(p(r"\v"), Hir::char('\x0B'));
1465 assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
1466 assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
1467 assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
1468 assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
1469 }
1470
1471 #[test]
1472 fn ok_hex() {
1473 assert_eq!(p(r"\x41"), Hir::char('A'));
1475 assert_eq!(p(r"\u2603"), Hir::char('☃'));
1476 assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
1477 assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
1479 assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
1480 assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
1481 }
1482
1483 #[test]
1484 fn ok_perl() {
1485 assert_eq!(p(r"\d"), posix("digit"));
1486 assert_eq!(p(r"\s"), posix("space"));
1487 assert_eq!(p(r"\w"), posix("word"));
1488
1489 let negated = |name| {
1490 let mut class = hir::Class::new(posix_class(name).unwrap());
1491 class.negate();
1492 Hir::class(class)
1493 };
1494 assert_eq!(p(r"\D"), negated("digit"));
1495 assert_eq!(p(r"\S"), negated("space"));
1496 assert_eq!(p(r"\W"), negated("word"));
1497 }
1498
1499 #[test]
1500 fn ok_flags_and_primitives() {
1501 assert_eq!(p(r"a"), Hir::char('a'));
1502 assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
1503
1504 assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
1505 assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
1506 assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
1507
1508 assert_eq!(p(r"$"), Hir::look(hir::Look::End));
1509 assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
1510 assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
1511
1512 assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')]));
1513 assert_eq!(
1514 p(r"(?R:.)"),
1515 class([
1516 ('\x00', '\x09'),
1517 ('\x0B', '\x0C'),
1518 ('\x0E', '\u{10FFFF}'),
1519 ])
1520 );
1521 assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')]));
1522 assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')]));
1523 }
1524
1525 #[test]
1526 fn ok_alternate() {
1527 assert_eq!(
1528 p(r"a|b"),
1529 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1530 );
1531 assert_eq!(
1532 p(r"(?:a|b)"),
1533 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1534 );
1535
1536 assert_eq!(
1537 p(r"(a|b)"),
1538 cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
1539 );
1540 assert_eq!(
1541 p(r"(?<foo>a|b)"),
1542 named_cap(
1543 1,
1544 "foo",
1545 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1546 )
1547 );
1548
1549 assert_eq!(
1550 p(r"a|b|c"),
1551 Hir::alternation(vec![
1552 Hir::char('a'),
1553 Hir::char('b'),
1554 Hir::char('c')
1555 ])
1556 );
1557
1558 assert_eq!(
1559 p(r"ax|by|cz"),
1560 Hir::alternation(vec![
1561 Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1562 Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1563 Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
1564 ])
1565 );
1566 assert_eq!(
1567 p(r"(ax|(by|(cz)))"),
1568 cap(
1569 1,
1570 Hir::alternation(vec![
1571 Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1572 cap(
1573 2,
1574 Hir::alternation(vec![
1575 Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1576 cap(
1577 3,
1578 Hir::concat(vec![
1579 Hir::char('c'),
1580 Hir::char('z')
1581 ])
1582 ),
1583 ])
1584 ),
1585 ])
1586 )
1587 );
1588
1589 assert_eq!(
1590 p(r"|"),
1591 Hir::alternation(vec![Hir::empty(), Hir::empty()])
1592 );
1593 assert_eq!(
1594 p(r"||"),
1595 Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
1596 );
1597
1598 assert_eq!(
1599 p(r"a|"),
1600 Hir::alternation(vec![Hir::char('a'), Hir::empty()])
1601 );
1602 assert_eq!(
1603 p(r"|a"),
1604 Hir::alternation(vec![Hir::empty(), Hir::char('a')])
1605 );
1606
1607 assert_eq!(
1608 p(r"(|)"),
1609 cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
1610 );
1611 assert_eq!(
1612 p(r"(a|)"),
1613 cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
1614 );
1615 assert_eq!(
1616 p(r"(|a)"),
1617 cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
1618 );
1619 }
1620
1621 #[test]
1622 fn ok_flag_group() {
1623 assert_eq!(
1624 p("a(?i:b)"),
1625 Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1626 );
1627 }
1628
1629 #[test]
1630 fn ok_flag_directive() {
1631 assert_eq!(p("(?i)a"), singles(['A', 'a']));
1632 assert_eq!(p("a(?i)"), Hir::char('a'));
1633 assert_eq!(
1634 p("a(?i)b"),
1635 Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1636 );
1637 assert_eq!(
1638 p("a(?i)a(?-i)a"),
1639 Hir::concat(vec![
1640 Hir::char('a'),
1641 singles(['A', 'a']),
1642 Hir::char('a'),
1643 ])
1644 );
1645 assert_eq!(
1646 p("a(?:(?i)a)a"),
1647 Hir::concat(vec![
1648 Hir::char('a'),
1649 singles(['A', 'a']),
1650 Hir::char('a'),
1651 ])
1652 );
1653 assert_eq!(
1654 p("a((?i)a)a"),
1655 Hir::concat(vec![
1656 Hir::char('a'),
1657 cap(1, singles(['A', 'a'])),
1658 Hir::char('a'),
1659 ])
1660 );
1661 }
1662
1663 #[test]
1664 fn ok_uncounted_repetition() {
1665 assert_eq!(
1666 p(r"a?"),
1667 Hir::repetition(hir::Repetition {
1668 min: 0,
1669 max: Some(1),
1670 greedy: true,
1671 sub: Box::new(Hir::char('a')),
1672 }),
1673 );
1674 assert_eq!(
1675 p(r"a*"),
1676 Hir::repetition(hir::Repetition {
1677 min: 0,
1678 max: None,
1679 greedy: true,
1680 sub: Box::new(Hir::char('a')),
1681 }),
1682 );
1683 assert_eq!(
1684 p(r"a+"),
1685 Hir::repetition(hir::Repetition {
1686 min: 1,
1687 max: None,
1688 greedy: true,
1689 sub: Box::new(Hir::char('a')),
1690 }),
1691 );
1692
1693 assert_eq!(
1694 p(r"a??"),
1695 Hir::repetition(hir::Repetition {
1696 min: 0,
1697 max: Some(1),
1698 greedy: false,
1699 sub: Box::new(Hir::char('a')),
1700 }),
1701 );
1702 assert_eq!(
1703 p(r"a*?"),
1704 Hir::repetition(hir::Repetition {
1705 min: 0,
1706 max: None,
1707 greedy: false,
1708 sub: Box::new(Hir::char('a')),
1709 }),
1710 );
1711 assert_eq!(
1712 p(r"a+?"),
1713 Hir::repetition(hir::Repetition {
1714 min: 1,
1715 max: None,
1716 greedy: false,
1717 sub: Box::new(Hir::char('a')),
1718 }),
1719 );
1720
1721 assert_eq!(
1722 p(r"a?b"),
1723 Hir::concat(vec![
1724 Hir::repetition(hir::Repetition {
1725 min: 0,
1726 max: Some(1),
1727 greedy: true,
1728 sub: Box::new(Hir::char('a')),
1729 }),
1730 Hir::char('b'),
1731 ]),
1732 );
1733
1734 assert_eq!(
1735 p(r"ab?"),
1736 Hir::concat(vec![
1737 Hir::char('a'),
1738 Hir::repetition(hir::Repetition {
1739 min: 0,
1740 max: Some(1),
1741 greedy: true,
1742 sub: Box::new(Hir::char('b')),
1743 }),
1744 ]),
1745 );
1746
1747 assert_eq!(
1748 p(r"(?:ab)?"),
1749 Hir::repetition(hir::Repetition {
1750 min: 0,
1751 max: Some(1),
1752 greedy: true,
1753 sub: Box::new(Hir::concat(vec![
1754 Hir::char('a'),
1755 Hir::char('b')
1756 ])),
1757 }),
1758 );
1759
1760 assert_eq!(
1761 p(r"(ab)?"),
1762 Hir::repetition(hir::Repetition {
1763 min: 0,
1764 max: Some(1),
1765 greedy: true,
1766 sub: Box::new(cap(
1767 1,
1768 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1769 )),
1770 }),
1771 );
1772
1773 assert_eq!(
1774 p(r"|a?"),
1775 Hir::alternation(vec![
1776 Hir::empty(),
1777 Hir::repetition(hir::Repetition {
1778 min: 0,
1779 max: Some(1),
1780 greedy: true,
1781 sub: Box::new(Hir::char('a')),
1782 })
1783 ]),
1784 );
1785 }
1786
1787 #[test]
1788 fn ok_counted_repetition() {
1789 assert_eq!(
1790 p(r"a{5}"),
1791 Hir::repetition(hir::Repetition {
1792 min: 5,
1793 max: Some(5),
1794 greedy: true,
1795 sub: Box::new(Hir::char('a')),
1796 }),
1797 );
1798 assert_eq!(
1799 p(r"a{5}?"),
1800 Hir::repetition(hir::Repetition {
1801 min: 5,
1802 max: Some(5),
1803 greedy: false,
1804 sub: Box::new(Hir::char('a')),
1805 }),
1806 );
1807
1808 assert_eq!(
1809 p(r"a{5,}"),
1810 Hir::repetition(hir::Repetition {
1811 min: 5,
1812 max: None,
1813 greedy: true,
1814 sub: Box::new(Hir::char('a')),
1815 }),
1816 );
1817
1818 assert_eq!(
1819 p(r"a{5,9}"),
1820 Hir::repetition(hir::Repetition {
1821 min: 5,
1822 max: Some(9),
1823 greedy: true,
1824 sub: Box::new(Hir::char('a')),
1825 }),
1826 );
1827
1828 assert_eq!(
1829 p(r"ab{5}c"),
1830 Hir::concat(vec![
1831 Hir::char('a'),
1832 Hir::repetition(hir::Repetition {
1833 min: 5,
1834 max: Some(5),
1835 greedy: true,
1836 sub: Box::new(Hir::char('b')),
1837 }),
1838 Hir::char('c'),
1839 ]),
1840 );
1841
1842 assert_eq!(
1843 p(r"a{ 5 }"),
1844 Hir::repetition(hir::Repetition {
1845 min: 5,
1846 max: Some(5),
1847 greedy: true,
1848 sub: Box::new(Hir::char('a')),
1849 }),
1850 );
1851 assert_eq!(
1852 p(r"a{ 5 , 9 }"),
1853 Hir::repetition(hir::Repetition {
1854 min: 5,
1855 max: Some(9),
1856 greedy: true,
1857 sub: Box::new(Hir::char('a')),
1858 }),
1859 );
1860 }
1861
1862 #[test]
1863 fn ok_group_unnamed() {
1864 assert_eq!(p("(a)"), cap(1, Hir::char('a')));
1865 assert_eq!(
1866 p("(ab)"),
1867 cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
1868 );
1869 }
1870
1871 #[test]
1872 fn ok_group_named() {
1873 assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1874 assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1875
1876 assert_eq!(
1877 p("(?P<foo>ab)"),
1878 named_cap(
1879 1,
1880 "foo",
1881 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1882 )
1883 );
1884 assert_eq!(
1885 p("(?<foo>ab)"),
1886 named_cap(
1887 1,
1888 "foo",
1889 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1890 )
1891 );
1892
1893 assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
1894 assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));
1895
1896 assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1897 assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1898
1899 assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1900 assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1901
1902 assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1903 assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1904
1905 assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1906 assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1907
1908 assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1909 assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1910 }
1911
1912 #[test]
1913 fn ok_class() {
1914 assert_eq!(p(r"[a]"), singles(['a']));
1915 assert_eq!(p(r"[a\]]"), singles(['a', ']']));
1916 assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
1917 assert_eq!(p(r"[ab]"), class([('a', 'b')]));
1918 assert_eq!(p(r"[a-]"), singles(['a', '-']));
1919 assert_eq!(p(r"[-a]"), singles(['a', '-']));
1920 assert_eq!(p(r"[--a]"), singles(['a', '-']));
1921 assert_eq!(p(r"[---a]"), singles(['a', '-']));
1922 assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
1923 assert_eq!(p(r"[\w]"), posix("word"));
1924 assert_eq!(p(r"[a\wz]"), posix("word"));
1925 assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')]));
1926 assert_eq!(p(r"[^\s\S]"), Hir::fail());
1927 assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
1928 assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
1929 assert_eq!(p(r"[]]"), singles([']']));
1930 assert_eq!(p(r"[]a]"), singles([']', 'a']));
1931 assert_eq!(p(r"[]\[]"), singles(['[', ']']));
1932 assert_eq!(p(r"[\[]"), singles(['[']));
1933
1934 assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
1935 assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
1936 assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
1937 assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
1938 assert_eq!(p(r"(?i)[β]"), singles(['β']));
1939
1940 assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')]));
1941 assert_eq!(
1942 p(r"[^-a]"),
1943 class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')])
1944 );
1945
1946 assert_eq!(
1947 p(r"[-]a]"),
1948 Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
1949 );
1950 }
1951
1952 #[test]
1953 fn ok_verbatim() {
1954 assert_eq!(
1955 p(r"(?x)a{5,9} ?"),
1956 Hir::repetition(hir::Repetition {
1957 min: 5,
1958 max: Some(9),
1959 greedy: false,
1960 sub: Box::new(Hir::char('a')),
1961 })
1962 );
1963 assert_eq!(p(r"(?x)[ a]"), singles(['a']));
1964 assert_eq!(
1965 p(r"(?x)[ ^ a]"),
1966 class([('\x00', '`'), ('b', '\u{10FFFF}')])
1967 );
1968 assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
1969 assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));
1970
1971 assert_eq!(
1972 p(r"(?x)a b"),
1973 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1974 );
1975 assert_eq!(
1976 p(r"(?x)a b(?-x)a b"),
1977 Hir::concat(vec![
1978 Hir::char('a'),
1979 Hir::char('b'),
1980 Hir::char('a'),
1981 Hir::char(' '),
1982 Hir::char('b'),
1983 ])
1984 );
1985 assert_eq!(
1986 p(r"a (?x:a )a "),
1987 Hir::concat(vec![
1988 Hir::char('a'),
1989 Hir::char(' '),
1990 Hir::char('a'),
1991 Hir::char('a'),
1992 Hir::char(' '),
1993 ])
1994 );
1995 assert_eq!(
1996 p(r"(?x)( ?P<foo> a )"),
1997 named_cap(1, "foo", Hir::char('a')),
1998 );
1999 assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a')));
2000 assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a'));
2001 assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
2002 assert_eq!(p(r"(?x)\ "), Hir::char(' '));
2003 }
2004
2005 #[test]
2006 fn ok_comments() {
2007 let pat = "(?x)
2008# This is comment 1.
2009foo # This is comment 2.
2010 # This is comment 3.
2011bar
2012# This is comment 4.";
2013 assert_eq!(
2014 p(pat),
2015 Hir::concat(vec![
2016 Hir::char('f'),
2017 Hir::char('o'),
2018 Hir::char('o'),
2019 Hir::char('b'),
2020 Hir::char('a'),
2021 Hir::char('r'),
2022 ])
2023 );
2024 }
2025
2026 #[test]
2027 fn err_standard() {
2028 assert_eq!(
2029 ERR_TOO_MUCH_NESTING,
2030 perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
2031 );
2032 assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
2037 assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
2038 assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
2039 assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
2040 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
2041 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
2042 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
2043 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
2044 assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)"));
2045 assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<"));
2046 assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<"));
2047 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)"));
2048 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
2049 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
2050 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
2051 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
2052 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
2053 assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
2054 assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo"));
2055 assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)"));
2056 assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
2057 assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
2058 assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
2059 assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
2060 assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
2061 assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
2062 assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
2063 assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
2064 assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
2065 assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
2066 assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
2067 assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
2068 assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
2069 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
2070 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
2071 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
2072 assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
2073 assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
2074 assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
2075 assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
2076 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
2077 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
2078 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
2079 assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
2080 assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
2081 assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
2082 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
2083 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
2084 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
2085 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
2086 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?"));
2087 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
2088 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
2089 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
2090 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
2091 assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
2092 assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
2093 assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
2094 assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
2095 assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
2096 assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
2097 assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
2098 assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
2099 assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
2100 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
2101 assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
2102 assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
2103 assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
2104 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
2105 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
2106 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
2107 assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
2108 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
2109 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
2110 assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
2111 assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
2112 assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
2113 assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
2114 assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2115 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2116 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2117 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2118 assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2119 assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
2120 }
2121
2122 #[test]
2123 fn err_verbatim() {
2124 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
2126 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
2127 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
2128 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ "));
2129 }
2130
2131 #[test]
2135 fn regression_454_nest_too_big() {
2136 let pattern = r#"
2137 2(?:
2138 [45]\d{3}|
2139 7(?:
2140 1[0-267]|
2141 2[0-289]|
2142 3[0-29]|
2143 4[01]|
2144 5[1-3]|
2145 6[013]|
2146 7[0178]|
2147 91
2148 )|
2149 8(?:
2150 0[125]|
2151 [139][1-6]|
2152 2[0157-9]|
2153 41|
2154 6[1-35]|
2155 7[1-5]|
2156 8[1-8]|
2157 90
2158 )|
2159 9(?:
2160 0[0-2]|
2161 1[0-4]|
2162 2[568]|
2163 3[3-6]|
2164 5[5-7]|
2165 6[0167]|
2166 7[15]|
2167 8[0146-9]
2168 )
2169 )\d{4}
2170 "#;
2171 p(pattern);
2172 }
2173
2174 #[test]
2178 fn regression_455_trailing_dash_ignore_whitespace() {
2179 p("(?x)[ / - ]");
2180 p("(?x)[ a - ]");
2181 p("(?x)[
2182 a
2183 - ]
2184 ");
2185 p("(?x)[
2186 a # wat
2187 - ]
2188 ");
2189
2190 perr("(?x)[ / -");
2191 perr("(?x)[ / - ");
2192 perr(
2193 "(?x)[
2194 / -
2195 ",
2196 );
2197 perr(
2198 "(?x)[
2199 / - # wat
2200 ",
2201 );
2202 }
2203
2204 #[test]
2205 fn regression_capture_indices() {
2206 let got = p(r"(a|ab|c|bcd){4,10}(d*)");
2207 assert_eq!(
2208 got,
2209 Hir::concat(vec![
2210 Hir::repetition(hir::Repetition {
2211 min: 4,
2212 max: Some(10),
2213 greedy: true,
2214 sub: Box::new(cap(
2215 1,
2216 Hir::alternation(vec![
2217 Hir::char('a'),
2218 Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
2219 Hir::char('c'),
2220 Hir::concat(vec![
2221 Hir::char('b'),
2222 Hir::char('c'),
2223 Hir::char('d')
2224 ]),
2225 ])
2226 ))
2227 }),
2228 cap(
2229 2,
2230 Hir::repetition(hir::Repetition {
2231 min: 0,
2232 max: None,
2233 greedy: true,
2234 sub: Box::new(Hir::char('d')),
2235 })
2236 ),
2237 ])
2238 );
2239 }
2240}