-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathboyer-moore_string_search_algorithm.sf
175 lines (137 loc) · 4.39 KB
/
boyer-moore_string_search_algorithm.sf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/ruby
#
## https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
#
# Translation of the Python example.
func match_length(Array S, idx1, idx2) {
if (idx1 == idx2) {
return (S.len - idx1)
}
var match_count = 0
while ((idx1 < S.len) && (idx2 < S.len) && (S[idx1] == S[idx2])) {
++match_count
++idx1
++idx2
}
return match_count
}
func fundamental_preprocess(Array S) {
if (S.len == 0) { # Handles case of empty string
return []
}
if (S.len == 1) { # Handles case of single-character string
return [1]
}
var z = S.len.of(0)
z[0] = S.len
z[1] = match_length(S, 0, 1)
for i in (2 .. z[1]) { # Optimization from exercise 1-5
z[i] = (z[1] - i + 1)
}
# Defines lower and upper limits of z-box
var l = 0
var r = 0
for i in (2+z[1] .. S.end) {
if (i <= r) { # i falls within existing z-box
var k = i-l
var b = z[k]
var a = (r - i + 1)
if (b < a) { # b ends within existing z-box
z[i] = b
}
else { # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
z[i] = a+match_length(S, a, r+1)
l = i
r = (i + z[i] - 1)
}
}
else { # i does not reside within existing z-box
z[i] = match_length(S, 0, i)
if (z[i] > 0) {
l = i
r = (i + z[i] - 1)
}
}
}
return z
}
func bad_character_table(Array S) {
if (S.len == 0) {
return 256.of { [] }
}
var R = 256.of { [-1] }
var alpha = 256.of(-1)
S.each_kv { |i,c|
alpha[c] = i
alpha.each_kv { |j, a|
R[j].append(a)
}
}
return R
}
func good_suffix_table(S) {
var L = S.len.of(-1)
var N = fundamental_preprocess(S.flip).flip
for j in (0 .. S.end) { # should the range be exclusive?
var i = (S.len - N[j])
if (i != S.len) {
L[i] = j
}
}
return L
}
func full_shift_table(S) {
var F = S.len.of(0)
var Z = fundamental_preprocess(S)
var longest = 0
Z.flip.each_kv { |i,zv|
longest = (zv == i+1 ? (zv `max` longest) : longest)
F[-i - 1] = longest
}
return F
}
func string_search(Array P, Array T) {
if ((P.len == 0) || (T.len == 0) || (T.len < P.len)) {
return []
}
var matches = []
# Preprocessing
var R = bad_character_table(P)
var L = good_suffix_table(P)
var F = full_shift_table(P)
var k = P.end # Represents alignment of end of P relative to T
var previous_k = -1 # Represents alignment in previous phase (Galil's rule)
while (k < T.len) {
var i = P.end # Character to compare in P
var h = k # Character to compare in T
while ((i >= 0) && (h > previous_k) && (P[i] == T[h])) { # Matches starting from end of P
--i
--h
}
if ((i == -1) || (h == previous_k)) { # Match has been found (Galil's rule)
matches.append(k - P.len + 1)
k += (P.len > 1 ? (P.len - F[1]) : 1)
}
else { # No match, shift by max of bad character and good suffix rules
var char_shift = (i - R[T[h]][i])
var suffix_shift = given (i+1) { |t|
case (t == P.len) { 1 } # Mismatch happened on first attempt
case (L[t] == -1) { P.len - F[t] } # Matched suffix does not appear anywhere in P
default { P.len - L[t] } # Matched suffix appears in P
}
var shift = (char_shift `max` suffix_shift)
previous_k = k if (shift >= i+1) # Galil's rule
k += shift
}
}
return matches
}
func string_search(String P, String T) {
string_search(P.bytes, T.bytes)
}
say(string_search("bar", "foobartestbarend")) #=> [3, 10]
say(string_search("bar", "foo-bar-test-bar-end")) #=> [4, 13]
say(string_search("Bar", "foo-bar-test-Bar-end-Bar")) #=> [13, 21]
say(string_search("-test-", "foo-bar-test-Bar-end-Bar")) #=> [7]
say(string_search("-Bar-end", "foo-Bar-test-Bar-end-Bar")) #=> [12]
say(string_search("-Bar", "foo-Bar-test-Bar-end-Bar")) #=> [3, 12, 20]