diff --git a/CHANGELOG.md b/CHANGELOG.md index a31b791e82..75fa9987f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Changes can also be flagged with a GitHub label for tracking purposes. The URL o ### Added - Added Action Center MVP behind new feature flag [#5622](https://github.com/ethyca/fides/pull/5622) - Added cache-clearing methods to the `DBCache` model to allow deleting cache entries [#5629](https://github.com/ethyca/fides/pull/5629) +- Adds partitioning, custom identities, multiple identities to test coverage for BigQuery Enterprise [#5618](https://github.com/ethyca/fides/pull/5618) ### Changed - Updated brand link url [#5656](https://github.com/ethyca/fides/pull/5656) diff --git a/data/dataset/bigquery_enterprise_test_dataset.yml b/data/dataset/bigquery_enterprise_test_dataset.yml index 64668192d0..f2532009cb 100644 --- a/data/dataset/bigquery_enterprise_test_dataset.yml +++ b/data/dataset/bigquery_enterprise_test_dataset.yml @@ -112,6 +112,61 @@ dataset: data_categories: [user.contact] - name: view_count data_categories: [system.operations] + - name: stackoverflow_posts_partitioned + fields: + - name: accepted_answer_id + data_categories: [ system.operations ] + - name: answer_count + data_categories: [ system.operations ] + - name: body + data_categories: [ user.contact ] + - name: comment_count + data_categories: [ system.operations ] + - name: community_owned_date + data_categories: [ system.operations ] + - name: creation_date + data_categories: [ system.operations ] + - name: favorite_count + data_categories: [ system.operations ] + - name: id + data_categories: [ system.operations ] + fides_meta: + data_type: integer + - name: last_activity_date + data_categories: [ system.operations ] + - name: last_edit_date + data_categories: [ system.operations ] + - name: last_editor_display_name + data_categories: [ system.operations ] + - name: last_editor_user_id + data_categories: [ system.operations ] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: owner_display_name + data_categories: [ user.contact ] + - name: owner_user_id + data_categories: [ system.operations ] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + data_type: integer + - name: parent_id + data_categories: [ system.operations ] + - name: post_type_id + data_categories: [ system.operations ] + - name: score + data_categories: [ system.operations ] + - name: tags + data_categories: [ system.operations ] + - name: title + data_categories: [ user.contact ] + - name: view_count + data_categories: [ system.operations] - name: users fields: - name: about_me diff --git a/tests/fixtures/bigquery_fixtures.py b/tests/fixtures/bigquery_fixtures.py index 105910e466..982e18a12b 100644 --- a/tests/fixtures/bigquery_fixtures.py +++ b/tests/fixtures/bigquery_fixtures.py @@ -202,6 +202,56 @@ def bigquery_enterprise_test_dataset_config( ctl_dataset.delete(db=db) +@pytest.fixture +def bigquery_enterprise_test_dataset_config_with_partitioning_meta( + bigquery_enterprise_connection_config: ConnectionConfig, + db: Session, + example_datasets: List[Dict], +) -> Generator: + bigquery_enterprise_dataset = example_datasets[16] + fides_key = bigquery_enterprise_dataset["fides_key"] + bigquery_enterprise_connection_config.name = fides_key + bigquery_enterprise_connection_config.key = fides_key + + # Update stackoverflow_posts_partitioned collection to have partition meta_data + # It is already set up as a partitioned table in BigQuery itself + stackoverflow_posts_partitioned_collection = next( + collection + for collection in bigquery_enterprise_dataset["collections"] + if collection["name"] == "stackoverflow_posts_partitioned" + ) + bigquery_enterprise_dataset["collections"].remove( + stackoverflow_posts_partitioned_collection + ) + stackoverflow_posts_partitioned_collection["fides_meta"] = { + "partitioning": { + "where_clauses": [ + "`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `creation_date` <= CURRENT_TIMESTAMP()", + "`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `creation_date` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)", + ] + } + } + bigquery_enterprise_dataset["collections"].append( + stackoverflow_posts_partitioned_collection + ) + + bigquery_enterprise_connection_config.save(db=db) + + ctl_dataset = CtlDataset.create_from_dataset_dict(db, bigquery_enterprise_dataset) + + dataset = DatasetConfig.create( + db=db, + data={ + "connection_config_id": bigquery_enterprise_connection_config.id, + "fides_key": fides_key, + "ctl_dataset_id": ctl_dataset.id, + }, + ) + yield dataset + dataset.delete(db=db) + ctl_dataset.delete(db=db) + + @pytest.fixture def bigquery_example_test_dataset_config_with_namespace_meta( bigquery_connection_config_without_default_dataset: ConnectionConfig, @@ -482,14 +532,14 @@ def bigquery_enterprise_resources( """ connection.execute(stmt) - # Create test stackoverflow_posts data. Posts are responses to questions on Stackoverflow, and does not include original question. + # Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and does not include original question. post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight." - stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts;" + stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;" res = connection.execute(stmt) random_increment = random.randint(0, 99999) post_id = res.all()[0][0] + random_increment stmt = f""" - insert into enterprise_dsr_testing.stackoverflow_posts (body, creation_date, id, owner_user_id, owner_display_name) + insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name) values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}'); """ connection.execute(stmt) @@ -539,7 +589,102 @@ def bigquery_enterprise_resources( stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};" connection.execute(stmt) - stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts where id = {post_id};" + stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};" + connection.execute(stmt) + + stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};" + connection.execute(stmt) + + +@pytest.fixture(scope="function") +def bigquery_enterprise_resources_with_partitioning( + bigquery_enterprise_test_dataset_config_with_partitioning_meta, +): + bigquery_connection_config = ( + bigquery_enterprise_test_dataset_config_with_partitioning_meta.connection_config + ) + connector = BigQueryConnector(bigquery_connection_config) + bigquery_client = connector.client() + with bigquery_client.connect() as connection: + + # Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate and id above this max + stmt = "select max(id) from enterprise_dsr_testing.users;" + res = connection.execute(stmt) + # Increment the id by a random number to avoid conflicts on concurrent test runs + random_increment = random.randint(0, 99999) + user_id = res.all()[0][0] + random_increment + display_name = ( + f"fides_testing_{user_id}" # prefix to do manual cleanup if needed + ) + last_access_date = datetime.now() + creation_date = datetime.now() + location = "Dream World" + + # Create test user data + stmt = f""" + insert into enterprise_dsr_testing.users (id, display_name, last_access_date, creation_date, location) + values ({user_id}, '{display_name}', '{last_access_date}', '{creation_date}', '{location}'); + """ + connection.execute(stmt) + + # Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and does not include original question. + post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight." + stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;" + res = connection.execute(stmt) + random_increment = random.randint(0, 99999) + post_id = res.all()[0][0] + random_increment + stmt = f""" + insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name) + values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}'); + """ + connection.execute(stmt) + + # Create test comments data. Comments are responses to posts or questions on Stackoverflow, and does not include original question or post itself. + stmt = "select max(id) from enterprise_dsr_testing.comments;" + res = connection.execute(stmt) + random_increment = random.randint(0, 99999) + comment_id = res.all()[0][0] + random_increment + comment_text = "FYI this only works if you have pytest installed locally." + stmt = f""" + insert into enterprise_dsr_testing.comments (id, text, creation_date, post_id, user_id, user_display_name) + values ({comment_id}, '{comment_text}', '{creation_date}', {post_id}, {user_id}, '{display_name}'); + """ + connection.execute(stmt) + + # Create test post_history data + stmt = "select max(id) from enterprise_dsr_testing.comments;" + res = connection.execute(stmt) + random_increment = random.randint(0, 99999) + post_history_id = res.all()[0][0] + random_increment + revision_text = "this works if you have pytest" + uuid = str(uuid4()) + stmt = f""" + insert into enterprise_dsr_testing.post_history (id, text, creation_date, post_id, user_id, post_history_type_id, revision_guid) + values ({post_history_id}, '{revision_text}', '{creation_date}', {post_id}, {user_id}, 1, '{uuid}'); + """ + connection.execute(stmt) + + yield { + "name": display_name, + "user_id": user_id, + "comment_id": comment_id, + "post_history_id": post_history_id, + "post_id": post_id, + "client": bigquery_client, + "connector": connector, + "first_comment_text": comment_text, + "first_post_body": post_body, + "revision_text": revision_text, + "display_name": display_name, + } + # Remove test data and close BigQuery connection in teardown + stmt = f"delete from enterprise_dsr_testing.post_history where id = {post_history_id};" + connection.execute(stmt) + + stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};" + connection.execute(stmt) + + stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};" connection.execute(stmt) stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};" @@ -571,6 +716,30 @@ def bigquery_test_engine(bigquery_keyfile_creds) -> Generator: engine.dispose() +def seed_bigquery_enterprise_integration_db( + bigquery_enterprise_test_dataset_config, +) -> None: + """ + Currently unused. + This helper function has already been run once, and data has been populated in the test BigQuery enterprise dataset. + We may need this later in case tables are accidentally removed. + """ + bigquery_connection_config = ( + bigquery_enterprise_test_dataset_config.connection_config + ) + connector = BigQueryConnector(bigquery_connection_config) + bigquery_client = connector.client() + with bigquery_client.connect() as connection: + + stmt = f"CREATE TABLE enterprise_dsr_testing.stackoverflow_posts_partitioned partition by date(creation_date) as select * from enterprise_dsr_testing.stackoverflow_posts;" + connection.execute(stmt) + + print( + f"Created table enterprise_dsr_testing.stackoverflow_posts_partitioned, " + f"partitioned on column creation_date." + ) + + def seed_bigquery_integration_db(bigquery_integration_engine) -> None: """ Currently unused. diff --git a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py index 5a133c031f..9042d4758a 100644 --- a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py +++ b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py @@ -10,29 +10,38 @@ PRIVACY_REQUEST_TASK_TIMEOUT = 5 # External services take much longer to return -PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL = 100 +PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL = 150 @pytest.mark.integration_bigquery @pytest.mark.integration_external @pytest.mark.parametrize( "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], + ["use_dsr_2_0", "use_dsr_3_0"], +) +@pytest.mark.parametrize( + "bigquery_fixtures", + [ + "bigquery_enterprise_test_dataset_config", + ], ) @mock.patch("fides.api.models.privacy_request.PrivacyRequest.trigger_policy_webhook") -def test_create_and_process_access_request_bigquery_enterprise( +def test_access_request( trigger_webhook_mock, - bigquery_enterprise_test_dataset_config, db, cache, policy, dsr_version, + bigquery_fixtures, request, policy_pre_execution_webhooks, policy_post_execution_webhooks, run_privacy_request_task, ): request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + request.getfixturevalue( + bigquery_fixtures + ) # required to test partitioning and non-partitioned tables customer_email = "customer-1@example.com" user_id = ( @@ -89,7 +98,9 @@ def test_create_and_process_access_request_bigquery_enterprise( len( [ post["title"] - for post in results["enterprise_dsr_testing:stackoverflow_posts"] + for post in results[ + "enterprise_dsr_testing:stackoverflow_posts_partitioned" + ] ] ) == 30 @@ -132,17 +143,17 @@ def test_create_and_process_access_request_bigquery_enterprise( @pytest.mark.parametrize( "bigquery_fixtures", [ - "bigquery_enterprise_resources" - ], # todo- add other resources to test, e.g. partitioned data + "bigquery_enterprise_resources", + "bigquery_enterprise_resources_with_partitioning", + ], ) -def test_create_and_process_erasure_request_bigquery( +def test_erasure_request( db, request, policy, cache, dsr_version, bigquery_fixtures, - bigquery_enterprise_test_dataset_config, bigquery_enterprise_erasure_policy, run_privacy_request_task, ): @@ -204,7 +215,9 @@ def test_create_and_process_erasure_request_bigquery( len( [ post["title"] - for post in results["enterprise_dsr_testing:stackoverflow_posts"] + for post in results[ + "enterprise_dsr_testing:stackoverflow_posts_partitioned" + ] ] ) == 1 @@ -248,7 +261,259 @@ def test_create_and_process_erasure_request_bigquery( assert row.user_display_name is None assert row.text is None - stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts where id = {post_id};" + stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};" + res = connection.execute(stmt).all() + for row in res: + assert ( + row.owner_user_id == bigquery_enterprise_resources["user_id"] + ) # not targeted by policy + assert row.owner_display_name is None + assert row.body is None + + stmt = f"select display_name, location from enterprise_dsr_testing.users where id = {user_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.display_name is None + assert row.location is None + + +@pytest.mark.integration_bigquery +@pytest.mark.integration_external +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0"], +) +@mock.patch("fides.api.models.privacy_request.PrivacyRequest.trigger_policy_webhook") +def test_access_request_multiple_custom_identities( + trigger_webhook_mock, + bigquery_enterprise_test_dataset_config, + db, + cache, + policy, + dsr_version, + request, + policy_pre_execution_webhooks, + policy_post_execution_webhooks, + run_privacy_request_task, +): + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + user_id = ( + 1754 # this is a real (not generated) user id in the Stackoverflow dataset + ) + data = { + "requested_at": "2024-08-30T16:09:37.359Z", + "policy_key": policy.key, + "identity": { + "loyalty_id": {"label": "Loyalty ID", "value": "CH-1"}, + "stackoverflow_user_id": { + "label": "Stackoverflow User Id", + "value": user_id, + }, + }, + } + + pr = get_privacy_request_results( + db, + policy, + run_privacy_request_task, + data, + PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, + ) + + results = pr.get_raw_access_results() + assert len(results.keys()) == 4 + + for key in results.keys(): + assert results[key] is not None + assert results[key] != {} + + users = results["enterprise_dsr_testing:users"] + assert len(users) == 1 + user_details = users[0] + assert user_details["id"] == user_id + + assert ( + len( + [ + comment["user_id"] + for comment in results["enterprise_dsr_testing:comments"] + ] + ) + == 16 + ) + assert ( + len( + [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]] + ) + == 39 + ) + assert ( + len( + [ + post["title"] + for post in results[ + "enterprise_dsr_testing:stackoverflow_posts_partitioned" + ] + ] + ) + == 30 + ) + + log_id = pr.execution_logs[0].id + pr_id = pr.id + + finished_audit_log: AuditLog = AuditLog.filter( + db=db, + conditions=( + (AuditLog.privacy_request_id == pr_id) + & (AuditLog.action == AuditLogAction.finished) + ), + ).first() + + assert finished_audit_log is not None + + # Both pre-execution webhooks and both post-execution webhooks were called + assert trigger_webhook_mock.call_count == 4 + + for webhook in policy_pre_execution_webhooks: + webhook.delete(db=db) + + for webhook in policy_post_execution_webhooks: + webhook.delete(db=db) + + policy.delete(db=db) + pr.delete(db=db) + assert not pr in db # Check that `pr` has been expunged from the session + assert ExecutionLog.get(db, object_id=log_id).privacy_request_id == pr_id + + +@pytest.mark.integration_external +@pytest.mark.integration_bigquery +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0"], +) +@pytest.mark.parametrize( + "bigquery_fixtures", + [ + "bigquery_enterprise_resources", + "bigquery_enterprise_resources_with_partitioning", + ], +) +def test_erasure_request_multiple_custom_identities( + db, + request, + policy, + cache, + dsr_version, + bigquery_fixtures, + bigquery_enterprise_erasure_policy, + run_privacy_request_task, +): + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + bigquery_enterprise_resources = request.getfixturevalue(bigquery_fixtures) + bigquery_client = bigquery_enterprise_resources["client"] + + # first test access request against manually added data + user_id = bigquery_enterprise_resources["user_id"] + data = { + "requested_at": "2024-08-30T16:09:37.359Z", + "policy_key": policy.key, + "identity": { + "loyalty_id": {"label": "Loyalty ID", "value": "CH-1"}, + "stackoverflow_user_id": { + "label": "Stackoverflow User Id", + "value": user_id, + }, + }, + } + + pr = get_privacy_request_results( + db, + policy, + run_privacy_request_task, + data, + PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, + ) + + results = pr.get_raw_access_results() + assert len(results.keys()) == 4 + + for key in results.keys(): + assert results[key] is not None + assert results[key] != {} + + users = results["enterprise_dsr_testing:users"] + assert len(users) == 1 + user_details = users[0] + assert user_details["id"] == user_id + + assert ( + len( + [ + comment["user_id"] + for comment in results["enterprise_dsr_testing:comments"] + ] + ) + == 1 + ) + assert ( + len( + [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]] + ) + == 1 + ) + assert ( + len( + [ + post["title"] + for post in results[ + "enterprise_dsr_testing:stackoverflow_posts_partitioned" + ] + ] + ) + == 1 + ) + + data = { + "requested_at": "2024-08-30T16:09:37.359Z", + "policy_key": bigquery_enterprise_erasure_policy.key, + "identity": { + "stackoverflow_user_id": { + "label": "Stackoverflow User Id", + "value": bigquery_enterprise_resources["user_id"], + }, + }, + } + + # Should erase all user data + pr = get_privacy_request_results( + db, + bigquery_enterprise_erasure_policy, + run_privacy_request_task, + data, + task_timeout=PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, + ) + pr.delete(db=db) + + bigquery_client = bigquery_enterprise_resources["client"] + post_history_id = bigquery_enterprise_resources["post_history_id"] + comment_id = bigquery_enterprise_resources["comment_id"] + post_id = bigquery_enterprise_resources["post_id"] + with bigquery_client.connect() as connection: + stmt = f"select text from enterprise_dsr_testing.post_history where id = {post_history_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.text is None + + stmt = f"select user_display_name, text from enterprise_dsr_testing.comments where id = {comment_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.user_display_name is None + assert row.text is None + + stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};" res = connection.execute(stmt).all() for row in res: assert (