Skip to content

Add regex sets. #173

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions HACKING.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ the NFA algorithm, because it was one fewer epsilon transition that it had to
follow.

There exist more instructions and they are defined and documented in
src/inst.rs.
src/prog.rs.

Compilation has several knobs and a few unfortunately complicated invariants.
Namely, the output of compilation can be one of two types of programs: a
Expand Down Expand Up @@ -163,7 +163,7 @@ engine (or engines) to use.

The logic for choosing which engine to execute is in src/exec.rs and is
documented on the Exec type. Exec values collect regular expression
Programs (defined in src/program.rs), which contain all the necessary tidbits
Programs (defined in src/prog.rs), which contain all the necessary tidbits
for actually executing a regular expression on search text.

For the most part, the execution logic is straightforward and follows the
Expand Down
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,34 @@ fn some_helper_function(text: &str) -> bool {
Specifically, in this example, the regex will be compiled when it is used for
the first time. On subsequent uses, it will reuse the previous compilation.

### Usage: match multiple regular expressions simultaneously

This demonstrates how to use a `RegexSet` to match multiple (possibly
overlapping) regular expressions in a single scan of the search text:

```rust
use regex::RegexSet;

let set = RegexSet::new(&[
r"\w+",
r"\d+",
r"\pL+",
r"foo",
r"bar",
r"barfoo",
r"foobar",
]).unwrap();

// Iterate over and collect all of the matches.
let matches: Vec<_> = set.matches("foobar").into_iter().collect();
assert_eq!(matches, vec![0, 2, 3, 4, 6]);

// You can also test whether a particular regex matched:
let matches = set.matches("foobar");
assert!(!matches.matched(5));
assert!(matches.matched(6));
```

### Usage: `regex!` compiler plugin

The `regex!` compiler plugin will compile your regexes at compile time. **This
Expand Down
27 changes: 14 additions & 13 deletions benches/bench_dynamic_compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,54 +8,55 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use regex_syntax::Expr;
use test::Bencher;

use regex::internal::ProgramBuilder;
use regex::internal::Compiler;

#[bench]
fn compile_simple(b: &mut Bencher) {
b.iter(|| {
let re = r"^bc(d|e)*$";
ProgramBuilder::new(&re).compile().unwrap()
let re = Expr::parse(r"^bc(d|e)*$").unwrap();
Compiler::new().compile(&[re]).unwrap()
});
}

#[bench]
fn compile_simple_bytes(b: &mut Bencher) {
b.iter(|| {
let re = r"^bc(d|e)*$";
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
let re = Expr::parse(r"^bc(d|e)*$").unwrap();
Compiler::new().bytes(true).compile(&[re]).unwrap()
});
}

#[bench]
fn compile_small(b: &mut Bencher) {
b.iter(|| {
let re = r"\p{L}|\p{N}|\s|.|\d";
ProgramBuilder::new(&re).compile().unwrap()
let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
Compiler::new().compile(&[re]).unwrap()
});
}

#[bench]
fn compile_small_bytes(b: &mut Bencher) {
b.iter(|| {
let re = r"\p{L}|\p{N}|\s|.|\d";
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
Compiler::new().bytes(true).compile(&[re]).unwrap()
});
}

#[bench]
fn compile_huge(b: &mut Bencher) {
b.iter(|| {
let re = r"\p{L}{100}";
ProgramBuilder::new(&re).compile().unwrap()
let re = Expr::parse(r"\p{L}{100}").unwrap();
Compiler::new().compile(&[re]).unwrap()
});
}

#[bench]
fn compile_huge_bytes(b: &mut Bencher) {
b.iter(|| {
let re = r"\p{L}{100}";
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
let re = Expr::parse(r"\p{L}{100}").unwrap();
Compiler::new().bytes(true).compile(&[re]).unwrap()
});
}
19 changes: 19 additions & 0 deletions examples/set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
extern crate regex;

use regex::RegexSet;

/// Demonstrates `RegexSet`: builds a set from several patterns, prints the
/// set itself, prints whether the search text matches, and then prints each
/// item produced by iterating over the set's matches for that same text.
fn main() {
    let res = &[
        "abc",
        "xyzz",
        "^[ga-fh-z]+$",
    ];
    let text = "abcggggggggxyz";
    // The patterns above are fixed literals, so a construction failure here
    // would be a bug in this example rather than a runtime condition.
    let set = RegexSet::new(res).unwrap();
    println!("{:?}", set);
    // Query the same haystack that is iterated below instead of repeating the
    // literal, so the two calls cannot drift apart when the example is edited.
    let m = set.is_match(text);
    println!("match? {:?}", m);
    for mi in set.matches(text) {
        println!("{:?}", mi);
    }
}
27 changes: 19 additions & 8 deletions regex-syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,19 @@ pub enum Repeater {
},
}

impl Repeater {
/// Returns true if and only if this repetition can match the empty string.
fn matches_empty(&self) -> bool {
use self::Repeater::*;
match *self {
ZeroOrOne => true,
ZeroOrMore => true,
OneOrMore => false,
Range { min, .. } => min == 0,
}
}
}

/// A character class.
///
/// A character class has a canonical format that the parser guarantees. Its
Expand Down Expand Up @@ -315,7 +328,9 @@ impl Expr {
/// the beginning of text.
pub fn is_anchored_start(&self) -> bool {
match *self {
Repeat { ref e, .. } => e.is_anchored_start(),
Repeat { ref e, r, .. } => {
!r.matches_empty() && e.is_anchored_start()
}
Group { ref e, .. } => e.is_anchored_start(),
Concat(ref es) => es[0].is_anchored_start(),
Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
Expand All @@ -328,7 +343,9 @@ impl Expr {
/// end of the text.
pub fn is_anchored_end(&self) -> bool {
match *self {
Repeat { ref e, .. } => e.is_anchored_end(),
Repeat { ref e, r, .. } => {
!r.matches_empty() && e.is_anchored_end()
}
Group { ref e, .. } => e.is_anchored_end(),
Concat(ref es) => es[es.len() - 1].is_anchored_end(),
Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
Expand Down Expand Up @@ -1059,9 +1076,6 @@ mod tests {
assert!(e("^a|^b").is_anchored_start());
assert!(e("(^a)|(^b)").is_anchored_start());
assert!(e("(^(a|b))").is_anchored_start());
assert!(e("^*").is_anchored_start());
assert!(e("(^)*").is_anchored_start());
assert!(e("((^)*)*").is_anchored_start());

assert!(!e("^a|b").is_anchored_start());
assert!(!e("a|^b").is_anchored_start());
Expand All @@ -1074,9 +1088,6 @@ mod tests {
assert!(e("a$|b$").is_anchored_end());
assert!(e("(a$)|(b$)").is_anchored_end());
assert!(e("((a|b)$)").is_anchored_end());
assert!(e("$*").is_anchored_end());
assert!(e("($)*").is_anchored_end());
assert!(e("(($)*)*").is_anchored_end());

assert!(!e("a$|b").is_anchored_end());
assert!(!e("a|b$").is_anchored_end());
Expand Down
4 changes: 4 additions & 0 deletions regex_macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ path = ".."
version = "0.1"
features = ["pattern"]

[dependencies.regex-syntax]
path = "../regex-syntax"
version = "0.2"

[dev-dependencies]
lazy_static = "0.1"
rand = "0.3"
Expand Down
52 changes: 31 additions & 21 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
#![feature(plugin_registrar, quote, rustc_private)]

extern crate regex;
extern crate syntax;
extern crate regex_syntax;
extern crate rustc_plugin;
extern crate syntax;

use std::collections::BTreeMap;
use std::usize;

use syntax::ast;
use syntax::codemap;
Expand All @@ -32,7 +36,8 @@ use syntax::ptr::P;

use rustc_plugin::Registry;

use regex::internal::{Inst, EmptyLook, Program, ProgramBuilder};
use regex::internal::{Compiler, EmptyLook, Inst, Program};
use regex_syntax::Expr;

/// For the `regex!` syntax extension. Do not use.
#[plugin_registrar]
Expand Down Expand Up @@ -67,15 +72,21 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
};
// We use the largest possible size limit because this is happening at
// compile time. We trust the programmer.
let bprog = ProgramBuilder::new(&regex).size_limit(::std::usize::MAX);
let prog = match bprog.compile() {
let expr = match Expr::parse(&regex) {
Ok(expr) => expr,
Err(err) => {
cx.span_err(sp, &err.to_string());
return DummyResult::any(sp)
}
};
let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) {
Ok(re) => re,
Err(err) => {
cx.span_err(sp, &err.to_string());
return DummyResult::any(sp)
}
};
let names = prog.cap_names.iter().cloned().collect();
let names = prog.captures.iter().cloned().collect();
let mut gen = NfaGen {
cx: &*cx,
sp: sp,
Expand All @@ -98,8 +109,8 @@ impl<'a> NfaGen<'a> {
fn code(&mut self) -> P<ast::Expr> {
// Most or all of the following things are used in the quasiquoted
// expression returned.
let num_cap_locs = 2 * self.prog.num_captures();
let num_insts = self.prog.insts.len();
let num_cap_locs = 2 * self.prog.captures.len();
let num_insts = self.prog.len();
let cap_names = self.vec_expr(self.names.iter(),
&mut |cx, name| match *name {
Some(ref name) => {
Expand All @@ -109,21 +120,20 @@ impl<'a> NfaGen<'a> {
None => cx.expr_none(self.sp),
}
);
let named_groups = {
let mut named_groups = ::std::collections::BTreeMap::new();
let capture_name_idx = {
let mut capture_name_idx = BTreeMap::new();
for (i, name) in self.names.iter().enumerate() {
if let Some(ref name) = *name {
named_groups.insert(name.to_owned(), i);
capture_name_idx.insert(name.to_owned(), i);
}
}
self.vec_expr(named_groups.iter(),
self.vec_expr(capture_name_idx.iter(),
&mut |cx, (name, group_idx)|
quote_expr!(cx, ($name, $group_idx))
)
};

let prefix_anchor = self.prog.anchored_begin;

let is_anchored_start = self.prog.is_anchored_start;
let step_insts = self.step_insts();
let add_insts = self.add_insts();
let regex = &*self.original;
Expand All @@ -135,9 +145,9 @@ impl<'a> NfaGen<'a> {
// the user is only warned about *their* unused variable/code, and not the
// unused code generated by regex!. See #14185 for an example.
#[allow(dead_code)]
static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
static CAPTURES: &'static [Option<&'static str>] = &$cap_names;
#[allow(dead_code)]
static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;
static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx;

#[allow(dead_code)]
fn exec<'t>(
Expand Down Expand Up @@ -175,14 +185,14 @@ fn exec<'t>(
clist.empty(); nlist.empty();
'LOOP: loop {
if clist.size == 0 {
if matched || (!at.is_beginning() && $prefix_anchor) {
if matched || (!at.is_start() && $is_anchored_start) {
break;
}
// TODO: Prefix matching... Hmm.
// Prefix matching now uses a DFA, so I think this is
// going to require encoding that DFA statically.
}
if clist.size == 0 || (!$prefix_anchor && !matched) {
if clist.size == 0 || (!$is_anchored_start && !matched) {
self.add(clist, &mut caps, 0, at);
}
let at_next = self.input.at(at.next_pos());
Expand Down Expand Up @@ -322,8 +332,8 @@ fn exec<'t>(

::regex::Regex::Native(::regex::internal::ExNative {
original: $regex,
names: &CAP_NAMES,
groups: &NAMED_GROUPS,
names: &CAPTURES,
groups: &CAPTURE_NAME_IDX,
prog: exec,
})
})
Expand All @@ -332,7 +342,7 @@ fn exec<'t>(
// Generates code for the `add` method, which is responsible for adding
// zero-width states to the next queue of states to visit.
fn add_insts(&self) -> P<ast::Expr> {
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
let body = match *inst {
Inst::EmptyLook(ref inst) => {
let nextpc = inst.goto;
Expand Down Expand Up @@ -422,7 +432,7 @@ fn exec<'t>(
// Generates the code for the `step` method, which processes all states
// in the current queue that consume a single character.
fn step_insts(&self) -> P<ast::Expr> {
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
let body = match *inst {
Inst::Match => quote_expr!(self.cx, {
for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) {
Expand Down
Loading