-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpcregex.ml
324 lines (288 loc) · 10.3 KB
/
pcregex.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
open Batteries
(** This module parses semi-Perl compatable Regular Expressions
*
* It is part of the NetSifter project
*
*)
type char_token = [
| `Char of int
| `Escape of char list
]
type stream_token =
[ char_token
| `Class of string
| `Substart
| `Substop
| `Caret (* ^ *)
| `Any (* . *)
| `Alternate (* | *)
| `Repeat of int * int option (* {3,} {2,4} {8} *)
]
type class_token =
[ char_token (* `Char of int, `Escape of char list *)
| `Range of int * int
| `Class of string
]
(** RegEx type *)
type ('a,'b) t =
| Union of ('a,'b) t list
| Concat of ('a,'b) t list
| Kleene of ('a,'b) t
| Value of 'a
| Accept of 'b
let epsilon = Union []
let catch_escape = function
| '\\'::'x'::h1::h2::t ->
Some (`Char (int_of_string (Printf.sprintf "0x%c%c" h1 h2)), t)
| '\\'::'x'::'{'::t ->
let rec loop acc = function
'}' :: t ->
Some (`Char (int_of_string (String.implode (List.rev acc))), t)
| c :: t -> loop (c::acc) t
| [] -> failwith "End reached looking for }"
in
loop ['x';'0'] t
| '\\'::'c'::x::t ->
let to_control_char c =
let upper_code = Char.uppercase c |> Char.code in
(upper_code lxor 0x60) in
Some (`Char (to_control_char x), t)
| '\\'::'n'::t -> Some (`Char (Char.code '\n'),t)
| '\\'::'r'::t -> Some (`Char (Char.code '\r'),t)
| '\\'::'t'::t -> Some (`Char (Char.code '\t'),t)
| '\\'::'a'::t -> Some (`Char 0x07,t) (* bell *)
| '\\'::'e'::t -> Some (`Char 0x1b,t) (* escape *)
| '\\'::'f'::t -> Some (`Char 0x0c,t) (* form feed *)
| '\\'::'v'::t -> Some (`Char 0x0b,t) (* vertical tab *)
| '\\'::x::t -> Some (`Escape [x], t)
| c::t -> Some (`Char (Char.code c), t)
| [] -> None
let catch_class char_list =
let acc = Buffer.create 15 in
let rec aux count = function
| '['::':'::'a'::'l'::'n'::'u'::'m'::':'::']'::b ->
Buffer.add_string acc "a-zA-Z0-9"; aux count b
| '['::':'::'s'::'p'::'a'::'c'::'e'::':'::']'::b ->
Buffer.add_string acc " \\t\\r\\n\\v\\f"; aux count b
(* TODO: COMPLETE LIST OF POSIX BRACKET EXPRESSIONS FROM HERE:
http://www.regular-expressions.info/posixbrackets.html *)
| '['::b -> (Buffer.add_char acc '[') ; aux (count+1) b
| ']'::b when count == 0 -> `Class (Buffer.contents acc), b
| ']'::b when count != 0 -> (Buffer.add_char acc ']') ; aux (count-1) b
| a::b -> (Buffer.add_char acc a) ; aux (count) b
| [] -> `Class (Buffer.contents acc), []
in
aux 0 char_list
let rec get_int acc = function
('0'..'9' as d) :: t -> get_int (10*acc + (Char.code d - 0x30)) t
| t -> acc, t
let catch_range chars =
let (v0,chars) = get_int 0 chars in
match chars with
',' :: chars ->
let (v1,chars) = get_int 0 chars in
( match chars with
'}'::chars -> `Repeat (v0,if v1 = 0 then None else Some v1), chars
| _ -> failwith "Expecting } for range"
)
| '}':: chars -> `Repeat (v0,Some v0), chars
| _ -> failwith "Expecting } or , for range"
let stream_tokenize str =
let rec make_token = function
| [] -> None
| '\\'::_ as l -> catch_escape l
| '['::t -> Some ( catch_class t )
| '(' :: '?' :: ':' :: t | '('::t
-> Some ( `Substart, t )
| ')'::t -> Some ( `Substop, t )
| '^'::t -> Some ( `Caret, t )
| '.'::t -> Some ( `Any, t )
| '|'::t -> Some ( `Alternate, t )
| '?'::t -> Some ( `Repeat (0,Some 1), t )
| '*'::t -> Some ( `Repeat (0,None), t )
| '+'::t -> Some ( `Repeat (1,None), t )
| '{'::t -> Some ( catch_range t )
| c::t (* ignores whitespace *)
when (c == '\n') || (c==' ')
|| (c=='\r') || (c=='\t') -> make_token t
| c::t -> Some ( (`Char (Char.code c)), t )
in
Enum.unfold (String.explode str) make_token ;;
let class_tokenize str =
let token_stream = Enum.unfold (String.explode str) catch_escape |> List.of_enum in
let rec parse_class = function
| [] -> None
| `Char a::`Char 45 (* - *)::`Char b::t ->
if a > b then parse_class t else Some (`Range (a,b), t)
| `Char c::t -> Some ( `Char c, t )
| `Escape e::t -> Some (`Escape e,t)
in
match token_stream with
| `Char 94 (* ^ *) :: t -> (false, Enum.unfold t parse_class)
| b -> (true, Enum.unfold b parse_class) ;;
let iset_of_class any set_of_escape str =
let aux acc = function
| `Char c -> ISet.add c acc
| `Range (lo,hi) -> ISet.add_range lo hi acc
| `Escape x -> ISet.union (set_of_escape x) acc
in
let (positive, tokens) = class_tokenize str in
if positive
then fold aux ISet.empty tokens
else ISet.diff any (fold aux ISet.empty tokens)
let rev_compare x y = - (Pervasives.compare x y)
let rec union_elim acc = function
[] -> List.sort_unique Pervasives.compare acc
| Union x :: t -> union_elim acc (x@t)
| h :: t -> union_elim (h::acc) t
let rec concat_elim acc = function
[] -> List.rev acc
| Concat x :: t -> concat_elim acc (x@t)
| h :: t -> concat_elim (h::acc) t
let rec reduce = function
| Union [x] -> reduce x
| Union l -> Union (union_elim [] l |> List.map reduce)
| Concat [] -> epsilon
| Concat [x] -> reduce x
| Concat (Kleene a :: Kleene b :: tl) when a=b -> reduce (Concat (Kleene a::tl))
| Concat l -> Concat (concat_elim [] l |> List.map reduce)
| Kleene x -> Kleene (reduce x)
| Value _ as e -> e
| Accept _ as e -> e
let rec print_regex printv printa ?root oc =
let self = print_regex printv printa ?root in
function
| x when (match root with Some r -> r==x | None -> false) ->
IO.nwrite oc "ROOT"
| Union [Concat []; reg] -> self oc reg; IO.write oc '?'
| Union regl -> List.print ~first:"(" ~sep:"|" ~last:")" self oc regl
| Concat regl -> List.print ~first:"" ~sep:"" ~last:"" self oc regl
| Kleene (Concat regl) -> List.print ~first:"(" ~sep:"" ~last:")" self oc regl; IO.write oc '*'
| Kleene reg -> self oc reg; IO.write oc '*'
| Value a -> printv oc a
| Accept i -> printa oc i
let print_char oc i =
IO.nwrite oc (Char.escaped (Char.chr i))
let print_range oc lo hi =
if lo = hi-1 then begin
print_char oc lo;
print_char oc hi;
end else if lo < hi then begin
print_char oc lo;
IO.nwrite oc "-";
print_char oc hi;
end else
print_char oc lo
(* IO.nwrite oc " "*)
let print_iset oc set =
match ISet.cardinal set with
256 -> IO.write oc '.'
| 1 -> ISet.iter_range (print_range oc) set
| _ ->
IO.write oc '[';
ISet.iter_range (print_range oc) set;
IO.write oc ']'
let print_iregex oc = print_regex print_iset ~root:(Obj.magic 0) oc
(* Returns a regex that matches any character in the string *)
let iset_of_string str =
let add_char acc c = ISet.add (Char.code c) acc in
String.fold_left add_char ISet.empty str
(** Takes a ascii str and returns a ISet.t t
assume that the regex is not anchored unless explicitly anchored *)
let regex_of_ascii_str ~dec str =
let stream = stream_tokenize str in
let escape_char_set = function (* TODO: implement more \. escapes *)
| 'n' -> iset_of_string "\n"
| 'r' -> iset_of_string "\r"
| 't' -> iset_of_string "\t"
| 'a' -> ISet.singleton 0x07 (* bell *)
| 'e' -> ISet.singleton 0x1b (* escape *)
| 'f' -> ISet.singleton 0x0c (* form feed *)
| 'v' -> ISet.singleton 0x0b (* vertical tab *)
| 'd' -> iset_of_string "0123456789"
| 's' -> iset_of_string " \t\r\n"
| x -> iset_of_string (String.of_char x)
in
let value_of_escape = function (* TODO: Implement more escape types *)
| [] -> failwith "End of string after escape"
| [x] -> escape_char_set x
| _ -> failwith "Unknown escape sequence"
in
let any = ISet.add_range 0 255 ISet.empty in
let regex_of_class str = Value (iset_of_class any value_of_escape str) in
let dup_n rx n = Concat (List.make n rx) in
let rec zero_thru_n rx n =
assert (n>0);
if n = 1 then Union [epsilon; rx]
else Union [epsilon; Concat [rx; zero_thru_n rx (n-1)]] in
let rec aux acc =
let mod_head f = match acc with [] -> assert false
| h :: t -> aux (f h :: t) in
match Enum.get stream with
| None -> Concat (List.rev acc)
| Some (`Char a) ->
aux ((Value (ISet.singleton a))::acc)
| Some (`Escape a) ->
aux (Value (value_of_escape a)::acc)
| Some (`Class a) ->
aux ((regex_of_class a)::acc)
| Some (`Substart) ->
aux ((aux [] )::acc)
| Some (`Substop) -> Concat (List.rev acc)
| Some (`Caret) -> aux (Value (iset_of_string "^")::acc)
| Some (`Any) -> aux ((Value any)::acc)
| Some (`Alternate) -> (* This is tricky *)
aux [Union [Concat (List.rev acc) ;aux [] ] ]
| Some (`Repeat (m,None)) -> (* unbounded *)
mod_head (fun g -> Concat [dup_n g m; Kleene g])
| Some (`Repeat (0, Some n)) ->
mod_head (fun g -> zero_thru_n g n)
| Some (`Repeat (m, Some n)) when n = m ->
mod_head (fun g -> dup_n g m)
| Some (`Repeat (m, Some n)) ->
mod_head (fun g -> Concat [dup_n g m; zero_thru_n g (n-m)])
in
let rx =
match Enum.peek stream with
| Some (`Caret) ->
Enum.junk stream;
reduce (aux [])
| _ -> reduce (Concat [Kleene (Value any); aux []])
in
reduce (Concat [rx; Accept dec])
;;
let match_char iset c = ISet.mem (Char.code c) iset
let regex_match match_val rx lst =
let rec loop = function
| Value _, [] -> None
| Value v, c::t -> if match_val v c then Some t else None
| Union [], _ -> None
| Union (h::t), str -> (* does this backtrack properly? *)
( match loop (h,str) with
None -> loop (Union t, str)
| Some t -> Some t )
| Concat [], str -> Some str
| Concat (h::t), str ->
( match loop (h,str) with
None -> None
| Some str_t -> loop (Concat t, str_t) )
| Kleene rx, str -> loop (Union [epsilon; Concat [rx; Kleene rx]],str)
| Accept _, str -> Some str
in
loop (rx,lst)
let regex_match_iset rx str =
match regex_match match_char rx (String.explode str) with
Some [] -> true
| Some _ -> false (* partial match *)
| None -> false
let line_to_regex ?(allow_comments=false) ~anchor (dec,line) =
if allow_comments && (String.length line < 1 || line.[0] = '#') then None else begin
(* eprintf "#Regex: %s\n" line; *)
let l = if anchor then "^" ^ line else line in
Some (regex_of_ascii_str ~dec l)
end
let join_regex e_rx = Union (List.of_enum e_rx)
let rx_of_dec_strings ?(anchor=false) ?allow_comments ?(max_regs=max_int) rxs =
Enum.filter_map (line_to_regex ~anchor ?allow_comments) rxs
|> Enum.take max_regs
|> join_regex