Skip to content

Commit 2978cb7

Browse files
asarhaddonkanaka
authored andcommitted
ocaml: rewrite the reader
A reference for the only mutable part reduces the boilerplate. A separate tokenizer is not necessary for S-expressions.
1 parent 3a91b6d commit 2978cb7

File tree

1 file changed

+61
-111
lines changed

1 file changed

+61
-111
lines changed

impls/ocaml/reader.ml

Lines changed: 61 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,115 +1,65 @@
1-
module T = Types.Types
2-
(* ^file ^module *)
1+
open Str (* not reentrant, but simple and always available *)
2+
open Types
33

4-
let find_re re str =
5-
List.map
6-
(function Str.Delim x -> x | Str.Text x -> "impossible!")
7-
(List.filter
8-
(function Str.Delim x -> true | Str.Text x -> false)
9-
(Str.full_split re str))
4+
let separator_re = regexp "\\([, \t\n]\\|;[^\n]*\\)+"
5+
let number_re = regexp "-?[0-9]+"
6+
let chars = "[^][, \t\n;(){}'`~@^\"]+"
7+
let keyword_re = regexp (":\\(" ^ chars ^ "\\)")
8+
let symbol_re = regexp chars
9+
let string_re = regexp {|"\(\(\\[\\n"]\|[^\\"]\)*\)"|}
10+
let escape_re = regexp {|\\.|}
11+
let quote_re = regexp_string "'"
12+
let quasiquote_re = regexp_string "`"
13+
let deref_re = regexp_string "@"
14+
let unquote_re = regexp_string "~"
15+
let sp_unq_re = regexp_string "~@"
16+
let with_meta_re = regexp_string "^"
17+
let list_re = regexp_string "("
18+
let map_re = regexp_string "{"
19+
let vector_re = regexp_string "["
20+
let close_re = regexp "[])}]" (* so "[1 2)" is accepted as a vector *)
1021

11-
let gsub re f str =
12-
String.concat ""
13-
(List.map
14-
(function Str.Delim x -> f x | Str.Text x -> x)
15-
(Str.full_split re str))
16-
17-
let token_re =
18-
Str.regexp
19-
"~@\\|[][{}()'`~^@]\\|\"\\(\\\\.\\|[^\"]\\)*\"?\\|;.*\\|[^][ \n\
20-
{}('\"`,;)]*"
21-
22-
let string_re = Str.regexp "\"\\(\\\\.\\|[^\\\\\"]\\)*\""
23-
24-
type reader = { form : Types.mal_type; tokens : string list }
25-
type list_reader = { list_form : Types.mal_type list; tokens : string list }
26-
27-
let unescape_string token =
28-
if Str.string_match string_re token 0 then
29-
let without_quotes = String.sub token 1 (String.length token - 2) in
30-
gsub (Str.regexp "\\\\.")
31-
(function "\\n" -> "\n" | x -> String.sub x 1 1)
32-
without_quotes
33-
else raise (Invalid_argument "expected '\"', got EOF")
34-
35-
let read_atom token =
36-
match token with
37-
| "nil" -> T.Nil
38-
| "true" -> T.Bool true
39-
| "false" -> T.Bool false
40-
| _ -> (
41-
match token.[0] with
42-
| '0' .. '9' -> T.Int (int_of_string token)
43-
| '-' -> (
44-
match String.length token with
45-
| 1 -> T.Symbol token
46-
| _ -> (
47-
match token.[1] with
48-
| '0' .. '9' -> T.Int (int_of_string token)
49-
| _ -> T.Symbol token))
50-
| '"' -> T.String (unescape_string token)
51-
| ':' -> T.Keyword (Str.replace_first (Str.regexp "^:") "" token)
52-
| _ -> T.Symbol token)
53-
54-
let rec read_list eol list_reader =
55-
match list_reader.tokens with
56-
| [] ->
57-
raise (Invalid_argument (Format.asprintf "expected '%s', got EOF" eol))
58-
| token :: tokens ->
59-
if Str.string_match (Str.regexp eol) token 0 then
60-
{ list_form = list_reader.list_form; tokens }
61-
else if token.[0] = ';' then
62-
read_list eol { list_form = list_reader.list_form; tokens }
63-
else
64-
let reader = read_form list_reader.tokens in
65-
read_list eol
66-
{
67-
list_form = list_reader.list_form @ [ reader.form ];
68-
tokens = reader.tokens;
69-
}
70-
71-
and read_quote sym tokens =
72-
let reader = read_form tokens in
73-
{ form = Types.list [ T.Symbol sym; reader.form ]; tokens = reader.tokens }
74-
75-
and read_form all_tokens =
76-
match all_tokens with
77-
| [] -> raise (Invalid_argument "no form found in the given string")
78-
| token :: tokens -> (
79-
match token with
80-
| "'" -> read_quote "quote" tokens
81-
| "`" -> read_quote "quasiquote" tokens
82-
| "~" -> read_quote "unquote" tokens
83-
| "~@" -> read_quote "splice-unquote" tokens
84-
| "@" -> read_quote "deref" tokens
85-
| "^" ->
86-
let meta = read_form tokens in
87-
let value = read_form meta.tokens in
88-
{
89-
form = Types.list [ T.Symbol "with-meta"; value.form; meta.form ];
90-
tokens = value.tokens;
91-
}
92-
| "(" ->
93-
let list_reader = read_list ")" { list_form = []; tokens } in
94-
{
95-
form = Types.list list_reader.list_form;
96-
tokens = list_reader.tokens;
97-
}
98-
| "{" ->
99-
let list_reader = read_list "}" { list_form = []; tokens } in
100-
{
101-
form = Types.list_into_map Types.MalMap.empty list_reader.list_form;
102-
tokens = list_reader.tokens;
103-
}
104-
| "[" ->
105-
let list_reader = read_list "]" { list_form = []; tokens } in
106-
{
107-
form = Types.vector list_reader.list_form;
108-
tokens = list_reader.tokens;
109-
}
110-
| _ ->
111-
if token.[0] = ';' then read_form tokens
112-
else { form = read_atom token; tokens })
22+
let unescape str =
23+
let e = match_end () - 1 in
24+
if str.[e] == 'n' then "\n" else String.sub str e 1
11325

11426
let read_str str =
115-
(read_form (List.filter (( <> ) "") (find_re token_re str))).form
27+
(* !p is the currently parsed position inside str *)
28+
let rec read pattern p =
29+
let result = string_match pattern str !p in
30+
if result then p := match_end ();
31+
result
32+
and read_list p =
33+
ignore (read separator_re p);
34+
if read close_re p then []
35+
else
36+
(* Parse the first form before the rest of the list *)
37+
let first = read_form p in
38+
first :: read_list p
39+
and read_form p =
40+
ignore (read separator_re p);
41+
if read number_re p then Types.Int (int_of_string (matched_string str))
42+
else if read keyword_re p then Keyword (matched_group 1 str)
43+
else if read symbol_re p then
44+
match matched_string str with
45+
| "nil" -> Nil
46+
| "true" -> Bool true
47+
| "false" -> Bool false
48+
| t -> Symbol t
49+
else if read string_re p then
50+
String (global_substitute escape_re unescape (matched_group 1 str))
51+
else if read quote_re p then list [ Symbol "quote"; read_form p ]
52+
else if read quasiquote_re p then list [ Symbol "quasiquote"; read_form p ]
53+
else if read deref_re p then list [ Symbol "deref"; read_form p ]
54+
else if read sp_unq_re p then list [ Symbol "splice-unquote"; read_form p ]
55+
else if read unquote_re p then list [ Symbol "unquote"; read_form p ]
56+
else if read with_meta_re p then
57+
(* Parse the metadata before the value *)
58+
let meta = read_form p in
59+
list [ Symbol "with-meta"; read_form p; meta ]
60+
else if read list_re p then list (read_list p)
61+
else if read vector_re p then vector (read_list p)
62+
else if read map_re p then list_into_map MalMap.empty (read_list p)
63+
else raise (Invalid_argument "unexpected EOF ] } ) or string escape")
64+
in
65+
read_form (ref 0)

0 commit comments

Comments
 (0)