@@ -3,7 +3,7 @@ use std::error::Error;
33use regex_automata:: {
44 dfa:: { dense, Automaton , OverlappingState } ,
55 nfa:: thompson,
6- HalfMatch , Input , MatchError ,
6+ Anchored , HalfMatch , Input , MatchError ,
77} ;
88
99// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,93 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767 assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
6868 Ok ( ( ) )
6969}
70+
// A variant of the doctest on [`Automaton::is_special_state`], except that
// the search is seeded from a universal start state.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // Runs an unanchored forward search by stepping `dfa` over `haystack`
    // one byte at a time, beginning at the DFA's universal start state.
    //
    // Returns the most recently observed match (if any), or a quit error
    // when a quit state is entered before any match has been seen.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        let mut found = None;
        // Feed every haystack byte to the automaton. Only special states
        // need any attention; everything else just keeps stepping.
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), at));
            } else if dfa.is_dead_state(sid) {
                // A dead state never transitions anywhere else, so whatever
                // has been recorded so far is the final answer.
                return Ok(found);
            } else if dfa.is_quit_state(sid) {
                // A quit state means the search failed, but it can be
                // entered after a match was already observed. Prefer
                // reporting that match over reporting the error.
                return match found {
                    Some(_) => Ok(found),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
            // Start and accel states could be special-cased here for
            // speed, but correctness does not depend on doing so.
        }
        // Because matches are reported one byte late, the special
        // end-of-input transition must be taken explicitly once the
        // haystack is exhausted.
        sid = dfa.next_eoi_state(sid);
        if dfa.is_match_state(sid) {
            let pid = dfa.match_pattern(sid, 0);
            found = Some(HalfMatch::new(pid, haystack.len()));
        }
        Ok(found)
    }

    // Searches `haystack` with `dfa` and asserts that a match exists with
    // the given pattern index and end offset.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let found = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(found.pattern().as_usize(), pat);
        assert_eq!(found.offset(), offset);
        Ok(())
    }

    // Applies the same expectation to the dense DFA and then to the sparse
    // DFA converted from it.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    const HAYSTACK: &str = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, HAYSTACK, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, HAYSTACK, 0, 15)?;

    let either = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&either, HAYSTACK, 1, 3)?;
    check(&either, &HAYSTACK[3..], 0, 7)?;
    check(&either, &HAYSTACK[10..], 1, 5)?;

    Ok(())
}
0 commit comments