diff --git a/.github/scripts/fixtures.sh b/.github/scripts/fixtures.sh
index 428b29d1..0aa61e6b 100755
--- a/.github/scripts/fixtures.sh
+++ b/.github/scripts/fixtures.sh
@@ -3,5 +3,13 @@ set -e
 hadoop fs -mkdir -p "/_test"
 hadoop fs -chmod 777 "/_test"
 
+if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    echo "Preparing encrypted zone..."
+    hadoop fs -mkdir "/_test/kms"
+    hadoop fs -chmod 777 "/_test/kms"
+    hadoop key create key1
+    hdfs crypto -createZone -keyName key1 -path /_test/kms
+fi
+
 hadoop fs -put ./testdata/foo.txt "/_test/foo.txt"
 hadoop fs -Ddfs.block.size=1048576 -put ./testdata/mobydick.txt "/_test/mobydick.txt"
diff --git a/.github/scripts/install-hdfs.sh b/.github/scripts/install-hdfs.sh
index 77d8803e..9b3af94d 100755
--- a/.github/scripts/install-hdfs.sh
+++ b/.github/scripts/install-hdfs.sh
@@ -2,7 +2,7 @@
 
 set -e
 
-KERBEROS=${KERBEROS-"false"}
+KERBEROS="${KERBEROS-false}"
 AES=${AES-"false"}
 if [ "$DATA_TRANSFER_PROTECTION" = "privacy" ]; then
     KERBEROS="true"
@@ -15,11 +15,18 @@ else
   ENCRYPT_DATA_TRANSFER="false"
 fi
 
+CONF_KMS_PROVIDER=""
+TRANSPARENT_ENCRYPTION=false
+if [ "$HADOOP_VERSION" != "2.10.1" ]; then
+    TRANSPARENT_ENCRYPTION=true
+    CONF_KMS_PROVIDER="kms://http@localhost:9600/kms"
+fi
+
 CONF_AUTHENTICATION="simple"
 KERBEROS_REALM="EXAMPLE.COM"
 KERBEROS_PRINCIPLE="administrator"
 KERBEROS_PASSWORD="password1234"
-if [ $KERBEROS = "true" ]; then
+if [ "$KERBEROS" = "true" ]; then
     CONF_AUTHENTICATION="kerberos"
 
     HOSTNAME=$(hostname)
@@ -50,13 +57,16 @@ EOF
   sudo apt-get install -y krb5-user krb5-kdc krb5-admin-server
   printf "$KERBEROS_PASSWORD\n$KERBEROS_PASSWORD" | sudo kdb5_util -r "$KERBEROS_REALM" create -s
 
-  for p in nn dn $USER gohdfs1 gohdfs2; do
+  for p in nn dn kms $USER gohdfs1 gohdfs2; do
     sudo kadmin.local -q "addprinc -randkey $p/$HOSTNAME@$KERBEROS_REALM"
     sudo kadmin.local -q "addprinc -randkey $p/localhost@$KERBEROS_REALM"
     sudo kadmin.local -q "xst -k /tmp/$p.keytab $p/$HOSTNAME@$KERBEROS_REALM"
     sudo kadmin.local -q "xst -k /tmp/$p.keytab $p/localhost@$KERBEROS_REALM"
     sudo chmod +rx /tmp/$p.keytab
   done
+  # HTTP service principal for the KMS
+  sudo kadmin.local -q "addprinc -randkey HTTP/localhost@$KERBEROS_REALM"
+  sudo kadmin.local -q "xst -k /tmp/kms.keytab HTTP/localhost@$KERBEROS_REALM"
 
   echo "Restarting krb services..."
   sudo service krb5-kdc restart
@@ -116,6 +126,10 @@ sudo tee $HADOOP_ROOT/etc/hadoop/core-site.xml <<EOF
     <name>hadoop.rpc.protection</name>
    <value>$RPC_PROTECTION</value>
   </property>
+  <property>
+    <name>hadoop.security.key.provider.path</name>
+    <value>$CONF_KMS_PROVIDER</value>
+  </property>
 </configuration>
 EOF
@@ -125,6 +139,10 @@ sudo tee $HADOOP_ROOT/etc/hadoop/hdfs-site.xml <<EOF
     <name>dfs.namenode.name.dir</name>
     <value>/tmp/hdfs/name</value>
   </property>
+  <property>
+    <name>dfs.namenode.fs-limits.min-block-size</name>
+    <value>131072</value>
+  </property>
   <property>
     <name>dfs.datanode.data.dir</name>
     <value>/tmp/hdfs/data</value>
@@ -172,6 +190,41 @@ $HADOOP_ROOT/bin/hdfs namenode -format
 sudo groupadd hadoop
 sudo usermod -a -G hadoop $USER
 
+sudo tee $HADOOP_ROOT/etc/hadoop/kms-site.xml <<EOF
+<configuration>
+  <property>
+    <name>hadoop.kms.key.provider.uri</name>
+    <value>jceks://file@/tmp/hdfs/kms.keystore</value>
+  </property>
+  <property>
+    <name>hadoop.security.keystore.java-keystore-provider.password-file</name>
+    <value>kms.keystore.password</value>
+  </property>
+  <property>
+    <name>hadoop.kms.authentication.type</name>
+    <value>$CONF_AUTHENTICATION</value>
+  </property>
+  <property>
+    <name>hadoop.kms.authentication.kerberos.keytab</name>
+    <value>/tmp/kms.keytab</value>
+  </property>
+  <property>
+    <name>hadoop.kms.authentication.kerberos.principal</name>
+    <value>HTTP/localhost@$KERBEROS_REALM</value>
+  </property>
+</configuration>
+EOF
+
+# The password only guards the local test keystore; any value works here.
+sudo tee $HADOOP_ROOT/etc/hadoop/kms.keystore.password <<EOF
+password1234
+EOF
+
+if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    echo "Starting kms..."
+    $HADOOP_ROOT/bin/hadoop kms > /tmp/hdfs/kms.log 2>&1 &
+fi
+
 echo "Starting namenode..."
 $HADOOP_ROOT/bin/hdfs namenode > /tmp/hdfs/namenode.log 2>&1 &
@@ -184,4 +237,5 @@ echo "Waiting for cluster to exit safe mode..."
 $HADOOP_ROOT/bin/hdfs dfsadmin -safemode wait
 
 echo "HADOOP_CONF_DIR=$(pwd)/$HADOOP_ROOT/etc/hadoop" >> $GITHUB_ENV
-echo "$(pwd)/$HADOOP_ROOT/bin" >> $GITHUB_PATH
\ No newline at end of file
+echo "TRANSPARENT_ENCRYPTION=$TRANSPARENT_ENCRYPTION" >> $GITHUB_ENV
+echo "$(pwd)/$HADOOP_ROOT/bin" >> $GITHUB_PATH
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7ff0ed63..f0755291 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,8 +38,8 @@ jobs:
         go-version: 1.17
 
-    # This step installs downloads hadoop and starts a local cluster with one
-    # namenode and one datanode. It adds the hadoop binaries to GITHUB_PATH
-    # and HADOOP_CONF_DIR to GITHUB_ENV.
+    # This step downloads hadoop and starts a local cluster with one
+    # namenode and one datanode. It adds the hadoop binaries to GITHUB_PATH,
+    # and TRANSPARENT_ENCRYPTION and HADOOP_CONF_DIR to GITHUB_ENV.
     - name: install-hdfs.sh
       run: ./.github/scripts/install-hdfs.sh
       env:
@@ -65,6 +65,16 @@ jobs:
       run: |
         make test
 
+    - name: cat kms.log
+      if: always()
+      run: |
+        if [ -f /tmp/hdfs/kms.log ]
+        then
+          cat /tmp/hdfs/kms.log
+        else
+          echo "kms.log does not exist"
+        fi
+
     - name: cat namenode.log
       if: always()
       run: cat /tmp/hdfs/namenode.log
diff --git a/cmd/hdfs/test/helper.bash b/cmd/hdfs/test/helper.bash
index 37a45241..d8d0552e 100644
--- a/cmd/hdfs/test/helper.bash
+++ b/cmd/hdfs/test/helper.bash
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 export HADOOP_FS=${HADOOP_FS-"hadoop fs"}
+export HADOOP_KEY=${HADOOP_KEY-"hadoop key"}
 export ROOT_TEST_DIR="$BATS_TEST_DIRNAME/../../.."
 export HDFS="$ROOT_TEST_DIR/hdfs"
diff --git a/cmd/hdfs/test/te.bats b/cmd/hdfs/test/te.bats
new file mode 100644
index 00000000..74dffa46
--- /dev/null
+++ b/cmd/hdfs/test/te.bats
@@ -0,0 +1,66 @@
+#!/usr/bin/env bats
+
+load helper
+
+@test "te: upload via native client, ensure we can download" {
+  if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    run $HADOOP_FS -put $ROOT_TEST_DIR/testdata/foo.txt /_test/kms/foo1
+    assert_success
+    run $HDFS cat /_test/kms/foo1
+    assert_output "bar"
+  else
+    skip
+  fi
+}
+
+@test "te: upload to encrypted zone, ensure native client can download" {
+  if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    run $HDFS put $ROOT_TEST_DIR/testdata/foo.txt /_test/kms/foo2
+    assert_success
+    run $HADOOP_FS -cat /_test/kms/foo2
+    assert_output "bar"
+  else
+    skip
+  fi
+}
+
+@test "te: tail" {
+  if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    run $HDFS put $ROOT_TEST_DIR/testdata/mobydick.txt /_test/kms/
+    assert_success
+    run bash -c "$HDFS tail /_test/kms/mobydick.txt > $BATS_TMPDIR/mobydick_test.txt"
+    assert_success
+    SHA=`tail $ROOT_TEST_DIR/testdata/mobydick.txt | shasum | awk '{ print $1 }'`
+    assert_equal $SHA `shasum < $BATS_TMPDIR/mobydick_test.txt | awk '{ print $1 }'`
+  else
+    skip
+  fi
+}
+
+@test "te: key not available" {
+  if [ "$TRANSPARENT_ENCRYPTION" = "true" ]; then
+    run $HADOOP_FS -mkdir -p /_test/kms-no-key
+    assert_success
+    run $HADOOP_KEY create key-removed
+    assert_success
+    run hdfs crypto -createZone -keyName key-removed -path /_test/kms-no-key
+    assert_success
+    run $HADOOP_FS -put $ROOT_TEST_DIR/testdata/foo.txt /_test/kms-no-key/foo
+    assert_success
+    run $HADOOP_KEY delete key-removed -f
+    assert_success
+    run $HDFS cat /_test/kms-no-key/foo
+    assert_failure
+    assert_output "open /_test/kms-no-key/foo: kms: 'key-removed@0' not found"
+
+    run $HDFS put $ROOT_TEST_DIR/testdata/foo.txt /_test/kms-no-key/foo2
+    assert_failure
+    assert_output "create /_test/kms-no-key/foo2: kms: 'key-removed@0' not found"
+
+    run $HDFS ls /_test/kms-no-key/foo2
+    assert_failure
assert_output "stat /_test/kms-no-key/foo2: file does not exist" + else + skip + fi +} diff --git a/file_writer_test.go b/file_writer_test.go index 38eb8a6a..6d30d9cb 100644 --- a/file_writer_test.go +++ b/file_writer_test.go @@ -531,3 +531,83 @@ func TestFileAppendDeadlineBefore(t *testing.T) { _, err = writer.Write([]byte("foo\n")) assert.Error(t, err) } + +func skipWithoutEncryptedZone(t *testing.T) { + if os.Getenv("TRANSPARENT_ENCRYPTION") != "true" { + t.Skip("Skipping, this test requires encryption zone to make sense") + } +} + +func TestEncryptedZoneWriteChunks(t *testing.T) { + skipWithoutEncryptedZone(t) + + originalText := []byte("some random plain text, nice to have it quite long") + client := getClient(t) + writer, err := client.Create("/_test/kms/write_chunks.txt") + require.NoError(t, err) + + var pos int64 = 0 + for _, x := range []int{5, 7, 6, 4, 28} { + _, err = writer.Write(originalText[pos : pos+int64(x)]) + require.NoError(t, err) + pos += int64(x) + } + assertClose(t, writer) + + reader, err := client.Open("/_test/kms/write_chunks.txt") + require.NoError(t, err) + + bytes, err := ioutil.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, originalText, bytes) +} + +func TestEncryptedZoneAppendChunks(t *testing.T) { + skipWithoutEncryptedZone(t) + + originalText := []byte("some random plain text, nice to have it quite long") + client := getClient(t) + writer, err := client.Create("/_test/kms/append_chunks.txt") + require.NoError(t, err) + assertClose(t, writer) + + var pos int64 = 0 + for _, x := range []int{5, 7, 6, 4, 28} { + writer, err := client.Append("/_test/kms/append_chunks.txt") + require.NoError(t, err) + _, err = writer.Write(originalText[pos : pos+int64(x)]) + require.NoError(t, err) + pos += int64(x) + assertClose(t, writer) + } + + reader, err := client.Open("/_test/kms/append_chunks.txt") + require.NoError(t, err) + bytes, err := ioutil.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, originalText, bytes) +} + +func TestEncryptedZoneLargeBlock(t *testing.T) { + skipWithoutEncryptedZone(t) + + // Generate quite large (aesChunkSize * 1.5 bytes) block, so we can trigger encryption in chunks. + str := "some random text" + originalText := []byte(strings.Repeat(str, aesChunkSize*1.5/len(str))) + client := getClient(t) + + // Create file with small (128Kb) block size, so encrypted chunk will be placed over multiple hdfs blocks. + writer, err := client.CreateFile("/_test/kms/large_write.txt", 1, 131072, 0755) + require.NoError(t, err) + + _, err = writer.Write(originalText) + require.NoError(t, err) + assertClose(t, writer) + + reader, err := client.Open("/_test/kms/large_write.txt") + require.NoError(t, err) + + bytes, err := ioutil.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, originalText, bytes) +}