-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathburrows-wheeler_transform_fast.sf
160 lines (123 loc) · 3.8 KB
/
burrows-wheeler_transform_fast.sf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/ruby
# Author: Trizen
# Date: 14 June 2023
# Edit: 15 June 2023
# https://github.com/trizen
# Implementation of the Burrows–Wheeler transform, with fast inversion.
# https://rosettacode.org/wiki/Burrows–Wheeler_transform
# Reference:
# Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
# https://youtube.com/watch?v=rQ7wwh4HRZM
define LOOKAHEAD_LEN = 128 # lower values are faster (on average)
func bwt_quadratic (s) { # O(n^2) space (impractical)
^s.len -> map {|i| [s.rotate(i), i] }.sort_by { .[0] }.map { .[1] }
}
func bwt_simple (s) { # O(n) space (very slow)
@(^s.len) -> sort {|a,b|
s.rotate(a) <=> s.rotate(b)
}
}
func bwt_cyclic (s) { # O(n) space (very slow)
var cyclic = s.chars
var len = cyclic.len
gather { # check if all the symbols are the same
take(true)
cyclic.each_cons(2, {|a,b|
if (a != b) {
take(false)
break
}
})
} == [true] && return @(^len)
@(^s.len) -> sort {|i,j|
while (cyclic[i] == cyclic[j]) {
i %= len if (++i >= len)
j %= len if (++j >= len)
}
cyclic[i] <=> cyclic[j]
}
}
func bwt_lookahead (s) { # O(n) space (moderately fast)
@(^s.len) -> sort {|a,b|
var t = s.slice(a, LOOKAHEAD_LEN)
var u = s.slice(b, LOOKAHEAD_LEN)
if (t.len < LOOKAHEAD_LEN) {
t += s.slice(0, min(a, LOOKAHEAD_LEN - t.len))
}
if (u.len < LOOKAHEAD_LEN) {
u += s.slice(0, min(b, LOOKAHEAD_LEN - u.len))
}
(t <=> u) || (s.rotate(a) <=> s.rotate(b))
}
}
func bwt_balanced (s) { # O(n * LOOKAHEAD_LEN) space (fast)
^s.len -> map {|i|
var t = s.slice(i, LOOKAHEAD_LEN)
if (t.len < LOOKAHEAD_LEN) {
t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
}
[t, i]
}.sort {|a,b|
(a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
}.map { .[1] }
}
func bwt_encode_quadratic(String s) { # O(n^2) space
#var bwt = s.len.of{|i| s.slice(i) + s.slice(0, i) }.sort
var bwt = s.len.of{|i| s.rotate(i) }.sort
var ret = bwt.map { .last }.join
var idx = bwt.index(s)
return (ret, idx)
}
func bwt_encode(String s) {
#var bwt = bwt_simple(s)
#var bwt = bwt_quadratic(s)
#var bwt = bwt_cyclic(s)
#var bwt = bwt_lookahead(s)
var bwt = bwt_balanced(s)
var ret = bwt.map {|i| s.slice(i-1, 1) }.join
var idx = bwt.first_index_by { .is_zero }
return (ret, idx)
}
func bwt_decode(String bwt, Number idx) { # fast inversion
var tail = bwt.chars
var head = tail.sort
var indices = []
tail.each_kv {|i,v|
indices[v.ord] := [] << i
}
var table = []
head.each_kv {|i,b|
table[i] = indices[b.ord].shift
}
var dec = ''
var i = idx
head.len.times {
dec += head[i]
i = table[i]
}
return dec
}
var tests = [
"banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "PINEAPPLE",
"","a","aa","aabb","aaaaaaaaaaaa","aaaaaaaaaaaab",
"baaaaaaaaaaaa","aaaaaabaaaaaa","aaaaaaabaaaaa",
File(__FILE__).read(:raw),
File($^PERL).read(:raw),
#File($^SIDEF).read(:raw),
]
tests.each { |str|
var (enc, idx) = bwt_encode(str)
var dec = bwt_decode(enc, idx)
if (enc.len < 1024) {
say "BWT(#{dec.dump}) = (#{enc.dump}, #{idx})"
}
assert_eq(str, dec)
}
__END__
BWT("banana") = ("nnbaaa", 3)
BWT("appellee") = ("eelplepa", 0)
BWT("dogwood") = ("odoodwg", 1)
BWT("TOBEORNOTTOBEORTOBEORNOT") = ("OOOBBBRRTTTEEENNOOORTTOO", 20)
BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = ("TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT", 29)
BWT("PINEAPPLE") = ("ENLPPIEPA", 6)