LA-108: Expand on BQ Enterprise test coverage - partitioning, custom identities (#5618)
eastandwestwind authored Jan 15, 2025
1 parent 4103da1 commit 31dcf58
Showing 4 changed files with 505 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -24,6 +24,7 @@ Changes can also be flagged with a GitHub label for tracking purposes. The URL o
### Added
- Added Action Center MVP behind new feature flag [#5622](https://github.com/ethyca/fides/pull/5622)
- Added cache-clearing methods to the `DBCache` model to allow deleting cache entries [#5629](https://github.com/ethyca/fides/pull/5629)
- Added partitioning, custom identities, and multiple identities to test coverage for BigQuery Enterprise [#5618](https://github.com/ethyca/fides/pull/5618)

### Changed
- Updated brand link url [#5656](https://github.com/ethyca/fides/pull/5656)
55 changes: 55 additions & 0 deletions data/dataset/bigquery_enterprise_test_dataset.yml
@@ -112,6 +112,61 @@ dataset:
data_categories: [user.contact]
- name: view_count
data_categories: [system.operations]
- name: stackoverflow_posts_partitioned
fields:
- name: accepted_answer_id
data_categories: [system.operations]
- name: answer_count
data_categories: [system.operations]
- name: body
data_categories: [user.contact]
- name: comment_count
data_categories: [system.operations]
- name: community_owned_date
data_categories: [system.operations]
- name: creation_date
data_categories: [system.operations]
- name: favorite_count
data_categories: [system.operations]
- name: id
data_categories: [system.operations]
fides_meta:
data_type: integer
- name: last_activity_date
data_categories: [system.operations]
- name: last_edit_date
data_categories: [system.operations]
- name: last_editor_display_name
data_categories: [system.operations]
- name: last_editor_user_id
data_categories: [system.operations]
fides_meta:
references:
- dataset: enterprise_dsr_testing
field: users.id
direction: from
- name: owner_display_name
data_categories: [user.contact]
- name: owner_user_id
data_categories: [system.operations]
fides_meta:
references:
- dataset: enterprise_dsr_testing
field: users.id
direction: from
data_type: integer
- name: parent_id
data_categories: [system.operations]
- name: post_type_id
data_categories: [system.operations]
- name: score
data_categories: [system.operations]
- name: tags
data_categories: [system.operations]
- name: title
data_categories: [user.contact]
- name: view_count
data_categories: [system.operations]
- name: users
fields:
- name: about_me
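The `references` entries under `fides_meta` on `last_editor_user_id` and `owner_user_id` are what let the DSR graph reach the new collection from `users`. As a rough sketch (not the actual Fides query builder), a `direction: from` reference resolves to a lookup along these lines, where the user ids come from rows already matched upstream in `enterprise_dsr_testing.users`:

    # Hypothetical illustration of how a `direction: from` reference is consumed.
    user_ids = [12345]  # ids matched for the data subject upstream (assumed)
    id_list = ", ".join(str(i) for i in user_ids)
    stmt = f"""
        select id, body, owner_display_name
        from enterprise_dsr_testing.stackoverflow_posts_partitioned
        where owner_user_id in ({id_list}) or last_editor_user_id in ({id_list});
    """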
177 changes: 173 additions & 4 deletions tests/fixtures/bigquery_fixtures.py
@@ -202,6 +202,56 @@ def bigquery_enterprise_test_dataset_config(
ctl_dataset.delete(db=db)


@pytest.fixture
def bigquery_enterprise_test_dataset_config_with_partitioning_meta(
bigquery_enterprise_connection_config: ConnectionConfig,
db: Session,
example_datasets: List[Dict],
) -> Generator:
bigquery_enterprise_dataset = example_datasets[16]
fides_key = bigquery_enterprise_dataset["fides_key"]
bigquery_enterprise_connection_config.name = fides_key
bigquery_enterprise_connection_config.key = fides_key

# Update the stackoverflow_posts_partitioned collection to carry partitioning metadata.
# The underlying table is already set up as a partitioned table in BigQuery itself.
stackoverflow_posts_partitioned_collection = next(
collection
for collection in bigquery_enterprise_dataset["collections"]
if collection["name"] == "stackoverflow_posts_partitioned"
)
bigquery_enterprise_dataset["collections"].remove(
stackoverflow_posts_partitioned_collection
)
stackoverflow_posts_partitioned_collection["fides_meta"] = {
"partitioning": {
"where_clauses": [
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `creation_date` <= CURRENT_TIMESTAMP()",
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `creation_date` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)",
]
}
}
bigquery_enterprise_dataset["collections"].append(
stackoverflow_posts_partitioned_collection
)

bigquery_enterprise_connection_config.save(db=db)

ctl_dataset = CtlDataset.create_from_dataset_dict(db, bigquery_enterprise_dataset)

dataset = DatasetConfig.create(
db=db,
data={
"connection_config_id": bigquery_enterprise_connection_config.id,
"fides_key": fides_key,
"ctl_dataset_id": ctl_dataset.id,
},
)
yield dataset
dataset.delete(db=db)
ctl_dataset.delete(db=db)
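With the fides_meta above, access and erasure queries against this collection are expected to run once per partition window rather than as a single full scan. A minimal sketch of that fan-out, assuming one query per configured where clause (this mirrors the meta above, not the internal Fides executor):

    # Hypothetical fan-out: one query per partition window. `where_clauses` is
    # the list configured above; `user_id` and `connection` are assumed.
    for clause in where_clauses:
        stmt = (
            "select id from enterprise_dsr_testing.stackoverflow_posts_partitioned "
            f"where owner_user_id = {user_id} and {clause};"
        )
        connection.execute(stmt)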


@pytest.fixture
def bigquery_example_test_dataset_config_with_namespace_meta(
bigquery_connection_config_without_default_dataset: ConnectionConfig,
@@ -482,14 +532,14 @@ def bigquery_enterprise_resources(
"""
connection.execute(stmt)

# Create test stackoverflow_posts data. Posts are responses to questions on Stackoverflow, and do not include the original question.
# Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts;"
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
insert into enterprise_dsr_testing.stackoverflow_posts (body, creation_date, id, owner_user_id, owner_display_name)
insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)
@@ -539,7 +589,102 @@ def bigquery_enterprise_resources(
stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts where id = {post_id};"
stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
connection.execute(stmt)
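Both this fixture and the partitioned variant below repeat the same max-id-plus-random-increment trick for every table they seed. A hypothetical helper (not part of this commit) that captures the pattern:

    def _unique_test_id(connection, table: str) -> int:
        """Hypothetical helper: bump the current max id in `table` by a random
        increment so concurrent test runs do not collide."""
        res = connection.execute(f"select max(id) from enterprise_dsr_testing.{table};")
        return res.all()[0][0] + random.randint(0, 99999)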


@pytest.fixture(scope="function")
def bigquery_enterprise_resources_with_partitioning(
bigquery_enterprise_test_dataset_config_with_partitioning_meta,
):
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config_with_partitioning_meta.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

# Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate an id above this max
stmt = "select max(id) from enterprise_dsr_testing.users;"
res = connection.execute(stmt)
# Increment the id by a random number to avoid conflicts on concurrent test runs
random_increment = random.randint(0, 99999)
user_id = res.all()[0][0] + random_increment
display_name = (
f"fides_testing_{user_id}" # prefix to do manual cleanup if needed
)
last_access_date = datetime.now()
creation_date = datetime.now()
location = "Dream World"

# Create test user data
stmt = f"""
insert into enterprise_dsr_testing.users (id, display_name, last_access_date, creation_date, location)
values ({user_id}, '{display_name}', '{last_access_date}', '{creation_date}', '{location}');
"""
connection.execute(stmt)

# Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

# Create test comments data. Comments are responses to posts or questions on Stackoverflow, and do not include the original question or post itself.
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
comment_id = res.all()[0][0] + random_increment
comment_text = "FYI this only works if you have pytest installed locally."
stmt = f"""
insert into enterprise_dsr_testing.comments (id, text, creation_date, post_id, user_id, user_display_name)
values ({comment_id}, '{comment_text}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

# Create test post_history data
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_history_id = res.all()[0][0] + random_increment
revision_text = "this works if you have pytest"
uuid = str(uuid4())
stmt = f"""
insert into enterprise_dsr_testing.post_history (id, text, creation_date, post_id, user_id, post_history_type_id, revision_guid)
values ({post_history_id}, '{revision_text}', '{creation_date}', {post_id}, {user_id}, 1, '{uuid}');
"""
connection.execute(stmt)

yield {
"name": display_name,
"user_id": user_id,
"comment_id": comment_id,
"post_history_id": post_history_id,
"post_id": post_id,
"client": bigquery_client,
"connector": connector,
"first_comment_text": comment_text,
"first_post_body": post_body,
"revision_text": revision_text,
"display_name": display_name,
}
# Remove test data and close BigQuery connection in teardown
stmt = f"delete from enterprise_dsr_testing.post_history where id = {post_history_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
@@ -571,6 +716,30 @@ def bigquery_test_engine(bigquery_keyfile_creds) -> Generator:
engine.dispose()


def seed_bigquery_enterprise_integration_db(
bigquery_enterprise_test_dataset_config,
) -> None:
"""
Currently unused.
This helper function has already been run once, and data has been populated in the test BigQuery enterprise dataset.
We may need this later in case tables are accidentally removed.
"""
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

stmt = f"CREATE TABLE enterprise_dsr_testing.stackoverflow_posts_partitioned partition by date(creation_date) as select * from enterprise_dsr_testing.stackoverflow_posts;"
connection.execute(stmt)

print(
"Created table enterprise_dsr_testing.stackoverflow_posts_partitioned, "
"partitioned on column creation_date."
)
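If the table ever needs to be re-created, the resulting partitions can be spot-checked over the same connection. A sketch using BigQuery's INFORMATION_SCHEMA.PARTITIONS view (assuming the dataset seeded above):

    stmt = """
        select partition_id, total_rows
        from enterprise_dsr_testing.INFORMATION_SCHEMA.PARTITIONS
        where table_name = 'stackoverflow_posts_partitioned'
        order by partition_id;
    """
    for row in connection.execute(stmt):
        print(row)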


def seed_bigquery_integration_db(bigquery_integration_engine) -> None:
"""
Currently unused.