|
| 1 | +-- test cases for collation support |
| 2 | + |
| 3 | +-- Create test table t1 with one utf8_binary column and one utf8_lcase column, holding aaa and bbb rows in lower and upper case |
| 4 | +create table t1(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; |
| 5 | +insert into t1 values('aaa', 'aaa'); |
| 6 | +insert into t1 values('AAA', 'AAA'); |
| 7 | +insert into t1 values('bbb', 'bbb'); |
| 8 | +insert into t1 values('BBB', 'BBB'); |
| 9 | + |
| 10 | +-- describe table t1 (NOTE(review): presumably checks that the column collations are reported - confirm against expected output) |
| 11 | +describe table t1; |
| 12 | + |
| 13 | +-- count rows per group when grouping by the utf8_binary column |
| 14 | +select count(*) from t1 group by utf8_binary; |
| 15 | + |
| 16 | +-- count rows per group when grouping by the utf8_lcase column |
| 17 | +select count(*) from t1 group by utf8_lcase; |
| 18 | + |
| 19 | +-- equality filter on the utf8_binary column against a literal with no explicit collation |
| 20 | +select * from t1 where utf8_binary = 'aaa'; |
| 21 | + |
| 22 | +-- equality filter on the utf8_lcase column against a literal explicitly collated as utf8_lcase |
| 23 | +select * from t1 where utf8_lcase = 'aaa' collate utf8_lcase; |
| 24 | + |
| 25 | +-- less-than filter on the utf8_binary column against a literal with no explicit collation |
| 26 | +select * from t1 where utf8_binary < 'bbb'; |
| 27 | + |
| 28 | +-- less-than filter on the utf8_lcase column against a literal explicitly collated as utf8_lcase |
| 29 | +select * from t1 where utf8_lcase < 'bbb' collate utf8_lcase; |
| 30 | + |
| 31 | +-- inner self-join of t1 on the utf8_lcase column, projecting utf8_binary from the left side and utf8_lcase from the right |
| 32 | +select l.utf8_binary, r.utf8_lcase from t1 l join t1 r on l.utf8_lcase = r.utf8_lcase; |
| 33 | + |
| 34 | +-- create second table t2 (same schema as t1, lower-case aaa and bbb rows only) for the anti-join |
| 35 | +create table t2(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; |
| 36 | +insert into t2 values('aaa', 'aaa'); |
| 37 | +insert into t2 values('bbb', 'bbb'); |
| 38 | + |
| 39 | +-- anti-join t1 against t2 on the utf8_lcase column, then drop both tables |
| 40 | +select * from t1 anti join t2 on t1.utf8_lcase = t2.utf8_lcase; |
| 41 | + |
| 42 | +drop table t2; |
| 43 | +drop table t1; |
| 44 | + |
| 45 | +-- set operations (except, except all, union, union all, intersect) with both sides collated as utf8_lcase - note the intersect left side has no upper-case AAA row |
| 46 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb'); |
| 47 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_lcase from values ('aaa'), ('bbb'); |
| 48 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_lcase from values ('aaa'), ('bbb'); |
| 49 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb'); |
| 50 | +select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb'); |
| 51 | + |
| 52 | +-- set operations with conflicting collations: left side utf8_lcase, right side unicode_ci (NOTE(review): presumably each is expected to be rejected - confirm against expected output) |
| 53 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate unicode_ci from values ('aaa'), ('bbb'); |
| 54 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate unicode_ci from values ('aaa'), ('bbb'); |
| 55 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate unicode_ci from values ('aaa'), ('bbb'); |
| 56 | +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate unicode_ci from values ('aaa'), ('bbb'); |
| 57 | +select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate unicode_ci from values ('aaa'), ('bbb'); |
| 58 | + |
| 59 | +-- create table t1 with a single struct column c1 whose fields are collated utf8_binary and utf8_lcase, then insert one lower-case and one upper-case row |
| 60 | +create table t1 (c1 struct<utf8_binary: string collate utf8_binary, utf8_lcase: string collate utf8_lcase>) USING PARQUET; |
| 61 | + |
| 62 | +insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_lcase', 'aaa')); |
| 63 | +insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_lcase', 'AAA')); |
| 64 | + |
| 65 | +-- count rows per group when grouping by the nested field c1.utf8_binary |
| 66 | +select count(*) from t1 group by c1.utf8_binary; |
| 67 | + |
| 68 | +-- count rows per group when grouping by the nested field c1.utf8_lcase |
| 69 | +select count(*) from t1 group by c1.utf8_lcase; |
| 70 | + |
| 71 | +drop table t1; |
| 72 | + |
| 73 | +-- array_contains and array_position over utf8_lcase elements, searching for a value that differs from an element only in case |
| 74 | +select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase); |
| 75 | +select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase); |
| 76 | + |
| 77 | +-- nullif and least over utf8_lcase arguments that differ only in case (least also receives a shorter third argument) |
| 78 | +select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase); |
| 79 | +select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase); |
| 80 | + |
| 81 | +-- array overlap, distinct, union, intersect and except over utf8_lcase arrays whose elements differ only in case |
| 82 | +select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); |
| 83 | +select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)); |
| 84 | +select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); |
| 85 | +select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); |
| 86 | +select array_except(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); |
| 87 | + |
| 88 | +-- ICU collations: unicode, en, sv, de and tr, with and without the _ci and _ai suffixes (all statements return true) |
| 89 | +select 'a' collate unicode < 'A'; |
| 90 | +select 'a' collate unicode_ci = 'A'; |
| 91 | +select 'a' collate unicode_ai = 'å'; |
| 92 | +select 'a' collate unicode_ci_ai = 'Å'; |
| 93 | +select 'a' collate en < 'A'; |
| 94 | +select 'a' collate en_ci = 'A'; |
| 95 | +select 'a' collate en_ai = 'å'; |
| 96 | +select 'a' collate en_ci_ai = 'Å'; |
| 97 | +select 'Kypper' collate sv < 'Köpfe'; |
| 98 | +select 'Kypper' collate de > 'Köpfe'; |
| 99 | +select 'I' collate tr_ci = 'ı'; |
| 100 | + |
| 101 | +-- create table t3 for str_to_map (text and keyValueDelim are utf8_binary, pairDelim is utf8_lcase), then call it with column collations and with explicit mixed, all-binary and all-unicode_ai collations |
| 102 | +create table t3 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet; |
| 103 | + |
| 104 | +insert into t3 values('a:1,b:2,c:3', ',', ':'); |
| 105 | + |
| 106 | +select str_to_map(text, pairDelim, keyValueDelim) from t3; |
| 107 | +select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t3; |
| 108 | +select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t3; |
| 109 | +select str_to_map(text collate unicode_ai, pairDelim collate unicode_ai, keyValueDelim collate unicode_ai) from t3; |
| 110 | + |
| 111 | +drop table t3; |
| 112 | + |
| 113 | +create table t1(s string, utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; |
| 114 | +insert into t1 values ('Spark', 'Spark', 'SQL'); |
| 115 | +insert into t1 values ('aaAaAAaA', 'aaAaAAaA', 'aaAaAAaA'); |
| 116 | +insert into t1 values ('aaAaAAaA', 'aaAaAAaA', 'aaAaaAaA'); |
| 117 | +insert into t1 values ('aaAaAAaA', 'aaAaAAaA', 'aaAaaAaAaaAaaAaAaaAaaAaA'); |
| 118 | +insert into t1 values ('bbAbaAbA', 'bbAbAAbA', 'a'); |
| 119 | +insert into t1 values ('İo', 'İo', 'İo'); |
| 120 | +insert into t1 values ('İo', 'İo', 'İo '); |
| 121 | +insert into t1 values ('İo', 'İo ', 'İo'); |
| 122 | +insert into t1 values ('İo', 'İo', 'i̇o'); |
| 123 | +insert into t1 values ('efd2', 'efd2', 'efd2'); |
| 124 | +insert into t1 values ('Hello, world! Nice day.', 'Hello, world! Nice day.', 'Hello, world! Nice day.'); |
| 125 | +insert into t1 values ('Something else. Nothing here.', 'Something else. Nothing here.', 'Something else. Nothing here.'); |
| 126 | +insert into t1 values ('kitten', 'kitten', 'sitTing'); |
| 127 | +insert into t1 values ('abc', 'abc', 'abc'); |
| 128 | +insert into t1 values ('abcdcba', 'abcdcba', 'aBcDCbA'); |
| 129 | + |
| 130 | +create table t2(ascii double) using parquet; |
| 131 | +insert into t2 values (97.52143); |
| 132 | +insert into t2 values (66.421); |
| 133 | + |
| 134 | +create table t3(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; |
| 135 | +insert into t3 values ('aaAaAAaA', 'aaAaaAaA'); |
| 136 | +insert into t3 values ('efd2', 'efd2'); |
| 137 | + |
| 138 | +create table t4(num long) using parquet; |
| 139 | +insert into t4 values (97); |
| 140 | +insert into t4 values (66); |
| 141 | + |
| 142 | +-- Elt over the t1 fixture created above (s has no explicit collation, the other two columns are collated), mixing plain columns, literals and explicit collate clauses (NOTE(review): explicitly conflicting collations presumably produce an error - confirm) |
| 143 | +select elt(2, s, utf8_binary) from t1; |
| 144 | +select elt(2, utf8_binary, utf8_lcase, s) from t1; |
| 145 | +select elt(1, utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t1; |
| 146 | +select elt(1, utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t1; |
| 147 | +select elt(1, utf8_binary collate utf8_binary, utf8_lcase) from t1; |
| 148 | +select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t1; |
| 149 | +select elt(1, utf8_binary, 'word' collate utf8_lcase), elt(1, utf8_lcase, 'word' collate utf8_binary) from t1; |
| 150 | + |
| 151 | +-- Ascii (over t1) & UnBase64 (over t3) string expressions, first with the column collations and then with the two collations swapped through explicit collate clauses |
| 152 | +select ascii(utf8_binary), ascii(utf8_lcase) from t1; |
| 153 | +select ascii(utf8_binary collate utf8_lcase), ascii(utf8_lcase collate utf8_binary) from t1; |
| 154 | +select unbase64(utf8_binary), unbase64(utf8_lcase) from t3; |
| 155 | +select unbase64(utf8_binary collate utf8_lcase), unbase64(utf8_lcase collate utf8_binary) from t3; |
| 156 | + |
| 157 | +-- Base64, and Decode applied to the result of Encode with utf-8, over t1 with the column collations and with the collations swapped through explicit collate clauses |
| 158 | +select base64(utf8_binary), base64(utf8_lcase) from t1; |
| 159 | +select base64(utf8_binary collate utf8_lcase), base64(utf8_lcase collate utf8_binary) from t1; |
| 160 | +select decode(encode(utf8_binary, 'utf-8'), 'utf-8'), decode(encode(utf8_lcase, 'utf-8'), 'utf-8') from t1; |
| 161 | +select decode(encode(utf8_binary collate utf8_lcase, 'utf-8'), 'utf-8'), decode(encode(utf8_lcase collate utf8_binary, 'utf-8'), 'utf-8') from t1; |
| 162 | + |
| 163 | +-- FormatNumber on the double column of t2, once with a format literal without explicit collation and once with a utf8_lcase format literal |
| 164 | +select format_number(ascii, '###.###') from t2; |
| 165 | +select format_number(ascii, '###.###' collate utf8_lcase) from t2; |
| 166 | + |
| 167 | +-- Encode, ToBinary with the utf-8 argument over t1, with the column collations and with the collations swapped through explicit collate clauses |
| 168 | +select encode(utf8_binary, 'utf-8'), encode(utf8_lcase, 'utf-8') from t1; |
| 169 | +select encode(utf8_binary collate utf8_lcase, 'utf-8'), encode(utf8_lcase collate utf8_binary, 'utf-8') from t1; |
| 170 | +select to_binary(utf8_binary, 'utf-8'), to_binary(utf8_lcase, 'utf-8') from t1; |
| 171 | +select to_binary(utf8_binary collate utf8_lcase, 'utf-8'), to_binary(utf8_lcase collate utf8_binary, 'utf-8') from t1; |
| 172 | + |
| 173 | +-- SoundEx over t1, with the column collations and with the collations swapped through explicit collate clauses |
| 174 | +select soundex(utf8_binary), soundex(utf8_lcase) from t1; |
| 175 | +select soundex(utf8_binary collate utf8_lcase), soundex(utf8_lcase collate utf8_binary) from t1; |
| 176 | + |
| 177 | +-- Luhncheck on the long column of t4 (this query has no collate clause and no collated column) |
| 178 | +select luhn_check(num) from t4; |
| 179 | + |
| 180 | +-- Levenshtein over t1: column pairs with differing collations, explicit collate overrides, literal arguments, and the three-argument form (NOTE(review): conflicting collations presumably produce an error - confirm) |
| 181 | +select levenshtein(utf8_binary, utf8_lcase) from t1; |
| 182 | +select levenshtein(s, utf8_binary) from t1; |
| 183 | +select levenshtein(utf8_binary collate utf8_binary, s collate utf8_lcase) from t1; |
| 184 | +select levenshtein(utf8_binary, utf8_lcase collate utf8_binary) from t1; |
| 185 | +select levenshtein(utf8_binary collate utf8_lcase, utf8_lcase collate utf8_lcase) from t1; |
| 186 | +select levenshtein(utf8_binary, 'a'), levenshtein(utf8_lcase, 'a') from t1; |
| 187 | +select levenshtein(utf8_binary, 'AaAA' collate utf8_lcase, 3), levenshtein(utf8_lcase, 'AAa' collate utf8_binary, 4) from t1; |
| 188 | + |
| 189 | +-- IsValidUTF8 over t1 with the column collations, with the collations swapped through explicit collate clauses, and with the _rtrim variants of the swapped collations |
| 190 | +select is_valid_utf8(utf8_binary), is_valid_utf8(utf8_lcase) from t1; |
| 191 | +select is_valid_utf8(utf8_binary collate utf8_lcase), is_valid_utf8(utf8_lcase collate utf8_binary) from t1; |
| 192 | +select is_valid_utf8(utf8_binary collate utf8_lcase_rtrim), is_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t1; |
| 193 | + |
| 194 | +-- MakeValidUTF8 over t1 with the same three collation variants (column, swapped, swapped with _rtrim) |
| 195 | +select make_valid_utf8(utf8_binary), make_valid_utf8(utf8_lcase) from t1; |
| 196 | +select make_valid_utf8(utf8_binary collate utf8_lcase), make_valid_utf8(utf8_lcase collate utf8_binary) from t1; |
| 197 | +select make_valid_utf8(utf8_binary collate utf8_lcase_rtrim), make_valid_utf8(utf8_lcase collate utf8_binary_rtrim) from t1; |
| 198 | + |
| 199 | +-- ValidateUTF8 over t1 with the same three collation variants (column, swapped, swapped with _rtrim) |
| 200 | +select validate_utf8(utf8_binary), validate_utf8(utf8_lcase) from t1; |
| 201 | +select validate_utf8(utf8_binary collate utf8_lcase), validate_utf8(utf8_lcase collate utf8_binary) from t1; |
| 202 | +select validate_utf8(utf8_binary collate utf8_lcase_rtrim), validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t1; |
| 203 | + |
| 204 | +-- TryValidateUTF8 over t1 with the same three collation variants, followed by dropping the fixture tables t1 to t4 |
| 205 | +select try_validate_utf8(utf8_binary), try_validate_utf8(utf8_lcase) from t1; |
| 206 | +select try_validate_utf8(utf8_binary collate utf8_lcase), try_validate_utf8(utf8_lcase collate utf8_binary) from t1; |
| 207 | +select try_validate_utf8(utf8_binary collate utf8_lcase_rtrim), try_validate_utf8(utf8_lcase collate utf8_binary_rtrim) from t1; |
| 208 | + |
| 209 | +drop table t1; |
| 210 | +drop table t2; |
| 211 | +drop table t3; |
| 212 | +drop table t4; |
0 commit comments