Add a grammars module with ready-to-use grammars

Andrew Lapp · rlouf · commit c2b0f4fb1886 · 2024-01-26T16:37:31.000+01:00
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -183,6 +183,23 @@ print(result)
 # 4*5*3*2*1/6*4*3*2*1/2*1*1*1/4*1*1*1/2*1*1*1/2*1*1/2*1*1*5*1/2*2*1*1/2*1*1*6*1*1/2*1*1*1*1*2*1*1*1*1
 ```
 
+
+EBNF grammars can be cumbersome to write. This is why Outlines provides ready-to-use grammar definitions in the `outlines.grammars` module:
+
+```python
+from outlines import models, generate, grammars
+
+model = models.transformers("mistralai/Mistral-7B-v0.1")
+generator = generate.cfg(model, grammars.arithmetic, max_tokens=100)
+
+result = generator("Write a series of operations on integers that return the number 5 ")
+print(result)
+# 100-2-75+50-18+27-501.
+```
+
+The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars).
+
+
 ### Regex-guided generation
 
 Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). For instance to force the model to generate IP addresses:
diff --git a/outlines/__init__.py b/outlines/__init__.py
@@ -1,5 +1,6 @@
 """Outlines is a Generative Model Programming Framework."""
 import outlines.generate
+import outlines.grammars
 import outlines.models
 import outlines.text.generate
 from outlines.base import vectorize
@@ -14,4 +15,5 @@
     "Function",
     "prompt",
     "vectorize",
+    "grammars",
 ]
diff --git a/outlines/fsm/fsm.py b/outlines/fsm/fsm.py
@@ -4,6 +4,7 @@
 from lark import Lark
 
 # from outlines.fsm.parsing import PartialLark
+from outlines import grammars
 from outlines.caching import cache
 from outlines.fsm.regex import create_fsm_index_tokenizer, make_deterministic_fsm
 
@@ -200,6 +201,7 @@ def __init__(self, cfg_string: str, tokenizer: "Tokenizer"):
             propagate_positions=False,
             maybe_placeholders=False,
             regex=True,
+            import_paths=[grammars.GRAMMAR_PATH],
         )
         self.terminal_regexps = dict()
         for terminal in self.parser.terminals:
diff --git a/outlines/grammars.py b/outlines/grammars.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+GRAMMAR_PATH = Path(__file__).parent / "grammars"
+
+
+def read_grammar(grammar_file_name, base_grammar_path=GRAMMAR_PATH):
+    """Read grammar file from default grammar path"""
+    full_path = base_grammar_path / grammar_file_name
+    with open(full_path) as file:
+        return file.read()
+
+
+arithmetic = read_grammar("arithmetic.lark")
+json = read_grammar("json.lark")
diff --git a/outlines/grammars/arithmetic.lark b/outlines/grammars/arithmetic.lark
@@ -0,0 +1,18 @@
+?start: sum
+
+?sum: product
+| sum "+" product   -> add
+| sum "-" product   -> sub
+
+?product: atom
+| product "*" atom  -> mul
+| product "/" atom  -> div
+
+?atom: NUMBER           -> number
+| "-" atom         -> neg
+| "(" sum ")"
+
+%import common.NUMBER
+%import common.WS_INLINE
+
+%ignore WS_INLINE
diff --git a/outlines/grammars/common.lark b/outlines/grammars/common.lark
@@ -0,0 +1,80 @@
+// Adapted from https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark
+
+// Lark License:
+// Copyright © 2017 Erez Shinan
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal in
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+// the Software, and to permit persons to whom the Software is furnished to do so,
+// subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+// Basic terminals for common use
+
+
+//
+// Numbers
+//
+
+DIGIT: "0".."9"
+HEXDIGIT: "a".."f"|"A".."F"|DIGIT
+
+INT: DIGIT+
+SIGNED_INT: ["+"|"-"] INT
+DECIMAL: INT "." INT? | "." INT
+
+// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
+_EXP: ("e"|"E") SIGNED_INT
+FLOAT: INT _EXP | DECIMAL _EXP?
+SIGNED_FLOAT: ["+"|"-"] FLOAT
+
+NUMBER: FLOAT | INT
+SIGNED_NUMBER: ["+"|"-"] NUMBER
+
+//
+// TODO: Working escaped_string
+//
+UNESCAPED_STRING: /\"[^"]*\"/
+
+
+
+//
+// Names (Variables)
+//
+LCASE_LETTER: "a".."z"
+UCASE_LETTER: "A".."Z"
+
+LETTER: UCASE_LETTER | LCASE_LETTER
+WORD: LETTER+
+
+CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
+
+
+//
+// Whitespace
+//
+WS_INLINE: (" "|/\t/)+
+WS: /[ \t\f\r\n]/+
+
+CR : /\r/
+LF : /\n/
+NEWLINE: (CR? LF)+
+
+
+// Comments
+SH_COMMENT: /#[^\n]*/
+CPP_COMMENT: /\/\/[^\n]*/
+C_COMMENT: "/*" /(.|\n)*?/ "*/"
+SQL_COMMENT: /--[^\n]*/
diff --git a/outlines/grammars/json.lark b/outlines/grammars/json.lark
@@ -0,0 +1,19 @@
+?start: value
+
+?value: object
+| array
+| UNESCAPED_STRING
+| SIGNED_NUMBER      -> number
+| "true"             -> true
+| "false"            -> false
+| "null"             -> null
+
+array  : "[" [value ("," value)*] "]"
+object : "{" [pair ("," pair)*] "}"
+pair   : UNESCAPED_STRING ":" value
+
+%import common.UNESCAPED_STRING
+%import common.SIGNED_NUMBER
+%import common.WS
+
+%ignore WS
diff --git a/tests/test_grammars.py b/tests/test_grammars.py
@@ -0,0 +1,31 @@
+import pytest
+
+import outlines.grammars as grammars
+from outlines.fsm.fsm import CFGFSM
+
+
+@pytest.mark.parametrize("grammar", [grammars.json, grammars.arithmetic])
+def test_grammar_module(grammar):
+    class MockTokenizer:
+        vocabulary = {"(": 1, ")": 2, "a": 3, "eos": 4}
+        special_tokens = {"eos"}
+        eos_token = "eos"
+        eos_token_id = 4
+
+        def convert_token_to_string(self, token):
+            return token
+
+        @property
+        def inverse_vocabulary(self):
+            return {v: k for k, v in self.vocabulary.items()}
+
+        def decode(self, token_ids):
+            return [self.inverse_vocabulary[t] for t in token_ids]
+
+    cfg_str = """
+        start: s
+        s: "(" s ")" | /a+/
+    """
+    tokenizer = MockTokenizer()
+    fsm = CFGFSM(cfg_str, tokenizer)
+    assert isinstance(fsm, CFGFSM)