1
1
import inspect
2
2
import json
3
3
import re
4
- from typing import Callable , Union
4
+ from typing import Callable , Optional , Union
5
5
6
6
from jsonschema .protocols import Validator
7
7
from pydantic import BaseModel , create_model
38
38
}
39
39
40
40
41
- def build_regex_from_object (object : Union [str , Callable , BaseModel ]):
41
+ def build_regex_from_object (
42
+ object : Union [str , Callable , BaseModel ], whitespace_pattern : Optional [str ] = None
43
+ ):
42
44
"""Turn a JSON schema into a regex that matches any JSON object that follows
43
45
this schema.
44
46
@@ -54,6 +56,9 @@ def build_regex_from_object(object: Union[str, Callable, BaseModel]):
54
56
----------
55
57
schema
56
58
A string that represents a JSON Schema.
59
+ whitespace_pattern
60
+ Pattern to use for JSON syntactic whitespace (doesn't impact string literals)
61
+ Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"`
57
62
58
63
Returns
59
64
-------
@@ -83,10 +88,12 @@ def build_regex_from_object(object: Union[str, Callable, BaseModel]):
83
88
resolver = registry .resolver ()
84
89
85
90
content = schema .contents
86
- return to_regex (resolver , content )
91
+ return to_regex (resolver , content , whitespace_pattern )
87
92
88
93
89
- def to_regex (resolver : Resolver , instance : dict ):
94
+ def to_regex (
95
+ resolver : Resolver , instance : dict , whitespace_pattern : Optional [str ] = None
96
+ ):
90
97
"""Translate a JSON Schema instance into a regex that validates the schema.
91
98
92
99
Note
@@ -105,8 +112,15 @@ def to_regex(resolver: Resolver, instance: dict):
105
112
An object that resolves references to other instances within a schema
106
113
instance
107
114
The instance to translate
115
+ whitespace_pattern
116
+ Pattern to use for JSON syntactic whitespace (doesn't impact string literals)
117
+ Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"`
108
118
"""
109
119
120
+ # set whitespace pattern
121
+ if whitespace_pattern is None :
122
+ whitespace_pattern = WHITESPACE
123
+
110
124
if "properties" in instance :
111
125
regex = ""
112
126
regex += r"\{"
@@ -120,12 +134,12 @@ def to_regex(resolver: Resolver, instance: dict):
120
134
if any (is_required ):
121
135
last_required_pos = max ([i for i , value in enumerate (is_required ) if value ])
122
136
for i , (name , value ) in enumerate (properties .items ()):
123
- subregex = f'{ WHITESPACE } "{ name } "{ WHITESPACE } :{ WHITESPACE } '
124
- subregex += to_regex (resolver , value )
137
+ subregex = f'{ whitespace_pattern } "{ name } "{ whitespace_pattern } :{ whitespace_pattern } '
138
+ subregex += to_regex (resolver , value , whitespace_pattern )
125
139
if i < last_required_pos :
126
- subregex = f"{ subregex } { WHITESPACE } ,"
140
+ subregex = f"{ subregex } { whitespace_pattern } ,"
127
141
elif i > last_required_pos :
128
- subregex = f"{ WHITESPACE } ,{ subregex } "
142
+ subregex = f"{ whitespace_pattern } ,{ subregex } "
129
143
regex += subregex if is_required [i ] else f"({ subregex } )?"
130
144
# If no property is required, we have to create a possible pattern for each property in which
131
145
# it's the last one necessarily present. Then, we add the others as optional before and after
@@ -134,41 +148,47 @@ def to_regex(resolver: Resolver, instance: dict):
134
148
else :
135
149
property_subregexes = []
136
150
for i , (name , value ) in enumerate (properties .items ()):
137
- subregex = f'{ WHITESPACE } "{ name } "{ WHITESPACE } :{ WHITESPACE } '
138
- subregex += to_regex (resolver , value )
151
+ subregex = f'{ whitespace_pattern } "{ name } "{ whitespace_pattern } :{ whitespace_pattern } '
152
+ subregex += to_regex (resolver , value , whitespace_pattern )
139
153
property_subregexes .append (subregex )
140
154
possible_patterns = []
141
155
for i in range (len (property_subregexes )):
142
156
pattern = ""
143
157
for subregex in property_subregexes [:i ]:
144
- pattern += f"({ subregex } { WHITESPACE } ,)?"
158
+ pattern += f"({ subregex } { whitespace_pattern } ,)?"
145
159
pattern += property_subregexes [i ]
146
160
for subregex in property_subregexes [i + 1 :]:
147
- pattern += f"({ WHITESPACE } ,{ subregex } )?"
161
+ pattern += f"({ whitespace_pattern } ,{ subregex } )?"
148
162
possible_patterns .append (pattern )
149
163
regex += f"({ '|' .join (possible_patterns )} )?"
150
164
151
- regex += f"{ WHITESPACE } " + r"\}"
165
+ regex += f"{ whitespace_pattern } " + r"\}"
152
166
153
167
return regex
154
168
155
169
# To validate against allOf, the given data must be valid against all of the
156
170
# given subschemas.
157
171
elif "allOf" in instance :
158
- subregexes = [to_regex (resolver , t ) for t in instance ["allOf" ]]
172
+ subregexes = [
173
+ to_regex (resolver , t , whitespace_pattern ) for t in instance ["allOf" ]
174
+ ]
159
175
subregexes_str = [f"{ subregex } " for subregex in subregexes ]
160
176
return rf"({ '' .join (subregexes_str )} )"
161
177
162
178
# To validate against `anyOf`, the given data must be valid against
163
179
# any (one or more) of the given subschemas.
164
180
elif "anyOf" in instance :
165
- subregexes = [to_regex (resolver , t ) for t in instance ["anyOf" ]]
181
+ subregexes = [
182
+ to_regex (resolver , t , whitespace_pattern ) for t in instance ["anyOf" ]
183
+ ]
166
184
return rf"({ '|' .join (subregexes )} )"
167
185
168
186
# To validate against oneOf, the given data must be valid against exactly
169
187
# one of the given subschemas.
170
188
elif "oneOf" in instance :
171
- subregexes = [to_regex (resolver , t ) for t in instance ["oneOf" ]]
189
+ subregexes = [
190
+ to_regex (resolver , t , whitespace_pattern ) for t in instance ["oneOf" ]
191
+ ]
172
192
173
193
xor_patterns = []
174
194
# json schema validation ensured there are no overlapping schemas in oneOf
@@ -195,7 +215,7 @@ def to_regex(resolver: Resolver, instance: dict):
195
215
elif "$ref" in instance :
196
216
path = f"{ instance ['$ref' ]} "
197
217
instance = resolver .lookup (path ).contents
198
- return to_regex (resolver , instance )
218
+ return to_regex (resolver , instance , whitespace_pattern )
199
219
200
220
# The type keyword may either be a string or an array:
201
221
# - If it's a string, it is the name of one of the basic types.
@@ -254,14 +274,14 @@ def to_regex(resolver: Resolver, instance: dict):
254
274
num_repeats = rf"{{{ max (min_items - 1 , 0 )} ,}}"
255
275
else :
256
276
if max_items < 1 :
257
- return rf"\[{ WHITESPACE } \]"
277
+ return rf"\[{ whitespace_pattern } \]"
258
278
num_repeats = rf"{{{ max (min_items - 1 , 0 )} ,{ max_items - 1 } }}"
259
279
260
280
allow_empty = "?" if min_items == 0 else ""
261
281
262
282
if "items" in instance :
263
- items_regex = to_regex (resolver , instance ["items" ])
264
- return rf"\[{ WHITESPACE } (({ items_regex } )(,{ WHITESPACE } ({ items_regex } )){ num_repeats } ){ allow_empty } { WHITESPACE } \]"
283
+ items_regex = to_regex (resolver , instance ["items" ], whitespace_pattern )
284
+ return rf"\[{ whitespace_pattern } (({ items_regex } )(,{ whitespace_pattern } ({ items_regex } )){ num_repeats } ){ allow_empty } { whitespace_pattern } \]"
265
285
else :
266
286
# Here we need to make the choice to exclude generating list of objects
267
287
# if the specification of the object is not given, even though a JSON
@@ -273,8 +293,8 @@ def to_regex(resolver: Resolver, instance: dict):
273
293
{"type" : "integer" },
274
294
{"type" : "string" },
275
295
]
276
- regexes = [to_regex (resolver , t ) for t in types ]
277
- return rf"\[{ WHITESPACE } ({ '|' .join (regexes )} )(,{ WHITESPACE } ({ '|' .join (regexes )} )){ num_repeats } ){ allow_empty } { WHITESPACE } \]"
296
+ regexes = [to_regex (resolver , t , whitespace_pattern ) for t in types ]
297
+ return rf"\[{ whitespace_pattern } ({ '|' .join (regexes )} )(,{ whitespace_pattern } ({ '|' .join (regexes )} )){ num_repeats } ){ allow_empty } { whitespace_pattern } \]"
278
298
279
299
elif instance_type == "boolean" :
280
300
return type_to_regex ["boolean" ]
@@ -287,7 +307,9 @@ def to_regex(resolver: Resolver, instance: dict):
287
307
# if the specification of the object is not given, even though a JSON
288
308
# object that contains an object here would be valid under the specification.
289
309
regexes = [
290
- to_regex (resolver , {"type" : t }) for t in instance_type if t != "object"
310
+ to_regex (resolver , {"type" : t }, whitespace_pattern )
311
+ for t in instance_type
312
+ if t != "object"
291
313
]
292
314
return rf"({ '|' .join (regexes )} )"
293
315
0 commit comments