Skip to content

Commit c2b0f4f

Browse files
Andrew Lapprlouf
authored and committed
Add a grammar modules with ready-to-use grammars
1 parent 5d67a5a commit c2b0f4f

File tree

8 files changed

+183
-0
lines changed

8 files changed

+183
-0
lines changed

docs/quickstart.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,23 @@ print(result)
183183
# 4*5*3*2*1/6*4*3*2*1/2*1*1*1/4*1*1*1/2*1*1*1/2*1*1/2*1*1*5*1/2*2*1*1/2*1*1*6*1*1/2*1*1*1*1*2*1*1*1*1
184184
```
185185

186+
187+
EBNF grammars can be cumbersome to write. This is why Outlines provides ready-to-use grammar definitions in the `outlines.grammars` module:
188+
189+
```python
190+
from outlines import models, generate, grammars
191+
192+
model = models.transformers("mistralai/Mistral-7B-v0.1")
193+
generator = generate.cfg(model, grammars.arithmetic, max_tokens=100)
194+
195+
result = generator("Write a series of operations on integers that return the number 5 ")
196+
print(result)
197+
# 100-2-75+50-18+27-501.
198+
```
199+
200+
The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars).
201+
202+
186203
### Regex-guided generation
187204

188205
Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). For instance to force the model to generate IP addresses:

outlines/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Outlines is a Generative Model Programming Framework."""
22
import outlines.generate
3+
import outlines.grammars
34
import outlines.models
45
import outlines.text.generate
56
from outlines.base import vectorize
@@ -14,4 +15,5 @@
1415
"Function",
1516
"prompt",
1617
"vectorize",
18+
"grammars",
1719
]

outlines/fsm/fsm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from lark import Lark
55

66
# from outlines.fsm.parsing import PartialLark
7+
from outlines import grammars
78
from outlines.caching import cache
89
from outlines.fsm.regex import create_fsm_index_tokenizer, make_deterministic_fsm
910

@@ -200,6 +201,7 @@ def __init__(self, cfg_string: str, tokenizer: "Tokenizer"):
200201
propagate_positions=False,
201202
maybe_placeholders=False,
202203
regex=True,
204+
import_paths=[grammars.GRAMMAR_PATH],
203205
)
204206
self.terminal_regexps = dict()
205207
for terminal in self.parser.terminals:

outlines/grammars.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from pathlib import Path
2+
3+
# Directory holding the bundled ``.lark`` grammar files, located next to this module.
GRAMMAR_PATH = Path(__file__).parent / "grammars"


def read_grammar(grammar_file_name, base_grammar_path=GRAMMAR_PATH):
    """Return the text of a grammar file.

    Parameters
    ----------
    grammar_file_name
        Name of the grammar file (e.g. ``"arithmetic.lark"``), relative to
        ``base_grammar_path``.
    base_grammar_path
        Directory containing the grammar file. Accepts a ``str`` or any
        path-like object; defaults to ``GRAMMAR_PATH``.

    Returns
    -------
    str
        The full contents of the grammar file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).

    """
    # Wrap in ``Path`` so a plain string directory works with the ``/`` operator.
    full_path = Path(base_grammar_path) / grammar_file_name
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    return full_path.read_text(encoding="utf-8")
11+
12+
13+
arithmetic = read_grammar("arithmetic.lark")
14+
json = read_grammar("json.lark")

outlines/grammars/arithmetic.lark

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
?start: sum
2+
3+
?sum: product
4+
| sum "+" product -> add
5+
| sum "-" product -> sub
6+
7+
?product: atom
8+
| product "*" atom -> mul
9+
| product "/" atom -> div
10+
11+
?atom: NUMBER -> number
12+
| "-" atom -> neg
13+
| "(" sum ")"
14+
15+
%import common.NUMBER
16+
%import common.WS_INLINE
17+
18+
%ignore WS_INLINE

outlines/grammars/common.lark

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Adapted from https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark
2+
3+
// Lark License:
4+
// Copyright © 2017 Erez Shinan
5+
//
6+
// Permission is hereby granted, free of charge, to any person obtaining a copy of
7+
// this software and associated documentation files (the "Software"), to deal in
8+
// the Software without restriction, including without limitation the rights to
9+
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
10+
// the Software, and to permit persons to whom the Software is furnished to do so,
11+
// subject to the following conditions:
12+
//
13+
// The above copyright notice and this permission notice shall be included in all
14+
// copies or substantial portions of the Software.
15+
//
16+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
18+
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
19+
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
20+
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21+
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22+
23+
24+
// Basic terminals for common use
25+
26+
27+
//
28+
// Numbers
29+
//
30+
31+
DIGIT: "0".."9"
32+
HEXDIGIT: "a".."f"|"A".."F"|DIGIT
33+
34+
INT: DIGIT+
35+
SIGNED_INT: ["+"|"-"] INT
36+
DECIMAL: INT "." INT? | "." INT
37+
38+
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
39+
_EXP: ("e"|"E") SIGNED_INT
40+
FLOAT: INT _EXP | DECIMAL _EXP?
41+
SIGNED_FLOAT: ["+"|"-"] FLOAT
42+
43+
NUMBER: FLOAT | INT
44+
SIGNED_NUMBER: ["+"|"-"] NUMBER
45+
46+
//
47+
// TODO: Working escaped_string
48+
//
49+
UNESCAPED_STRING: /\"[^"]*\"/
50+
51+
52+
53+
//
54+
// Names (Variables)
55+
//
56+
LCASE_LETTER: "a".."z"
57+
UCASE_LETTER: "A".."Z"
58+
59+
LETTER: UCASE_LETTER | LCASE_LETTER
60+
WORD: LETTER+
61+
62+
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
63+
64+
65+
//
66+
// Whitespace
67+
//
68+
WS_INLINE: (" "|/\t/)+
69+
WS: /[ \t\f\r\n]/+
70+
71+
CR : /\r/
72+
LF : /\n/
73+
NEWLINE: (CR? LF)+
74+
75+
76+
// Comments
77+
SH_COMMENT: /#[^\n]*/
78+
CPP_COMMENT: /\/\/[^\n]*/
79+
C_COMMENT: "/*" /(.|\n)*?/ "*/"
80+
SQL_COMMENT: /--[^\n]*/

outlines/grammars/json.lark

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
?start: value
2+
3+
?value: object
4+
| array
5+
| UNESCAPED_STRING
6+
| SIGNED_NUMBER -> number
7+
| "true" -> true
8+
| "false" -> false
9+
| "null" -> null
10+
11+
array : "[" [value ("," value)*] "]"
12+
object : "{" [pair ("," pair)*] "}"
13+
pair : UNESCAPED_STRING ":" value
14+
15+
%import common.UNESCAPED_STRING
16+
%import common.SIGNED_NUMBER
17+
%import common.WS
18+
19+
%ignore WS

tests/test_grammars.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
3+
import outlines.grammars as grammars
4+
from outlines.fsm.fsm import CFGFSM
5+
6+
7+
@pytest.mark.parametrize("grammar", [grammars.json, grammars.arithmetic])
def test_grammar_module(grammar):
    """Check that each bundled grammar can be used to build a ``CFGFSM``.

    The parametrized ``grammar`` string is passed to ``CFGFSM`` directly, so
    every grammar shipped in ``outlines.grammars`` is actually compiled
    rather than an unrelated hard-coded grammar.
    """

    class MockTokenizer:
        # Minimal tokenizer stand-in handed to ``CFGFSM``.
        vocabulary = {"(": 1, ")": 2, "a": 3, "eos": 4}
        special_tokens = {"eos"}
        eos_token = "eos"
        eos_token_id = 4

        def convert_token_to_string(self, token):
            return token

        @property
        def inverse_vocabulary(self):
            return {v: k for k, v in self.vocabulary.items()}

        def decode(self, token_ids):
            return [self.inverse_vocabulary[t] for t in token_ids]

    tokenizer = MockTokenizer()
    fsm = CFGFSM(grammar, tokenizer)
    assert isinstance(fsm, CFGFSM)

0 commit comments

Comments
 (0)