LA-108: Expand on BQ Enterprise test coverage - partitioning, custom identities #5618

Merged (20 commits) on Jan 15, 2025
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -24,6 +24,7 @@ Changes can also be flagged with a GitHub label for tracking purposes. The URL o
### Added
- Added Action Center MVP behind new feature flag [#5622](https://github.com/ethyca/fides/pull/5622)
- Added cache-clearing methods to the `DBCache` model to allow deleting cache entries [#5629](https://github.com/ethyca/fides/pull/5629)
- Added partitioning, custom identities, and multiple identities to test coverage for BigQuery Enterprise [#5618](https://github.com/ethyca/fides/pull/5618)

### Changed
- Updated brand link url [#5656](https://github.com/ethyca/fides/pull/5656)
2 changes: 1 addition & 1 deletion data/dataset/bigquery_enterprise_test_dataset.yml
@@ -151,7 +151,7 @@ dataset:
custom_request_field: null
fields: null
fides_meta: null
-  - name: stackoverflow_posts
+  - name: stackoverflow_posts_partitioned
description: null
data_categories: null
fields:
177 changes: 173 additions & 4 deletions tests/fixtures/bigquery_fixtures.py
@@ -202,6 +202,56 @@ def bigquery_enterprise_test_dataset_config(
ctl_dataset.delete(db=db)


@pytest.fixture
def bigquery_enterprise_test_dataset_config_with_partitioning_meta(
bigquery_enterprise_connection_config: ConnectionConfig,
db: Session,
example_datasets: List[Dict],
) -> Generator:
bigquery_enterprise_dataset = example_datasets[16]
fides_key = bigquery_enterprise_dataset["fides_key"]
bigquery_enterprise_connection_config.name = fides_key
bigquery_enterprise_connection_config.key = fides_key

    # Update the stackoverflow_posts_partitioned collection to include partitioning metadata
# It is already set up as a partitioned table in BigQuery itself
stackoverflow_posts_partitioned_collection = next(
collection
for collection in bigquery_enterprise_dataset["collections"]
if collection["name"] == "stackoverflow_posts_partitioned"
)
bigquery_enterprise_dataset["collections"].remove(
stackoverflow_posts_partitioned_collection
)
stackoverflow_posts_partitioned_collection["fides_meta"] = {
"partitioning": {
"where_clauses": [
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `creation_date` <= CURRENT_TIMESTAMP()",
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `creation_date` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)",
]
}
}
bigquery_enterprise_dataset["collections"].append(
stackoverflow_posts_partitioned_collection
)

bigquery_enterprise_connection_config.save(db=db)

ctl_dataset = CtlDataset.create_from_dataset_dict(db, bigquery_enterprise_dataset)

dataset = DatasetConfig.create(
db=db,
data={
"connection_config_id": bigquery_enterprise_connection_config.id,
"fides_key": fides_key,
"ctl_dataset_id": ctl_dataset.id,
},
)
yield dataset
dataset.delete(db=db)
ctl_dataset.delete(db=db)
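
# Illustration only, not part of this PR: a minimal sketch of how partition
# `where_clauses` like the ones in the fixture above can expand one logical
# query into per-partition statements. `apply_partitioning` and `base_query`
# are hypothetical names, not the Fides implementation.
from typing import List

def apply_partitioning(base_query: str, where_clauses: List[str]) -> List[str]:
    # Each clause bounds `creation_date` to a disjoint time window, so the
    # generated statements together cover the table exactly once.
    return [f"{base_query} WHERE {clause}" for clause in where_clauses]

stmts = apply_partitioning(
    "SELECT * FROM enterprise_dsr_testing.stackoverflow_posts_partitioned",
    [
        "`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `creation_date` <= CURRENT_TIMESTAMP()",
        "`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `creation_date` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)",
    ],
)  # yields two SELECTs, one per partition window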


@pytest.fixture
def bigquery_example_test_dataset_config_with_namespace_meta(
bigquery_connection_config_without_default_dataset: ConnectionConfig,
Expand Down Expand Up @@ -482,14 +532,14 @@ def bigquery_enterprise_resources(
"""
connection.execute(stmt)

-# Create test stackoverflow_posts data. Posts are responses to questions on Stackoverflow, and does not include original question.
+# Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts;"
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
-insert into enterprise_dsr_testing.stackoverflow_posts (body, creation_date, id, owner_user_id, owner_display_name)
+insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)
@@ -539,7 +589,102 @@ def bigquery_enterprise_resources(
stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts where id = {post_id};"
stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
connection.execute(stmt)
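
# Illustration only, not part of this PR: the max-id-plus-random-increment
# pattern used above (and again in the partitioned variant below) factored
# into a small helper; `next_test_id` is a hypothetical name.
import random

def next_test_id(connection, table: str) -> int:
    # Offset the table's current max id by a random amount so that concurrent
    # test runs are unlikely to collide on the rows they insert.
    res = connection.execute(f"select max(id) from {table};")
    return res.all()[0][0] + random.randint(0, 99999)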


@pytest.fixture(scope="function")
def bigquery_enterprise_resources_with_partitioning(
bigquery_enterprise_test_dataset_config_with_partitioning_meta,
):
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config_with_partitioning_meta.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

        # Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate an id above this max
stmt = "select max(id) from enterprise_dsr_testing.users;"
res = connection.execute(stmt)
# Increment the id by a random number to avoid conflicts on concurrent test runs
random_increment = random.randint(0, 99999)
user_id = res.all()[0][0] + random_increment
        display_name = (
            f"fides_testing_{user_id}"  # prefix allows manual cleanup if needed
        )
last_access_date = datetime.now()
creation_date = datetime.now()
location = "Dream World"

# Create test user data
stmt = f"""
insert into enterprise_dsr_testing.users (id, display_name, last_access_date, creation_date, location)
values ({user_id}, '{display_name}', '{last_access_date}', '{creation_date}', '{location}');
"""
connection.execute(stmt)

        # Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

        # Create test comments data. Comments are responses to posts or questions on Stackoverflow and do not include the original question or post itself.
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
comment_id = res.all()[0][0] + random_increment
comment_text = "FYI this only works if you have pytest installed locally."
stmt = f"""
insert into enterprise_dsr_testing.comments (id, text, creation_date, post_id, user_id, user_display_name)
values ({comment_id}, '{comment_text}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

# Create test post_history data
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_history_id = res.all()[0][0] + random_increment
revision_text = "this works if you have pytest"
uuid = str(uuid4())
stmt = f"""
insert into enterprise_dsr_testing.post_history (id, text, creation_date, post_id, user_id, post_history_type_id, revision_guid)
values ({post_history_id}, '{revision_text}', '{creation_date}', {post_id}, {user_id}, 1, '{uuid}');
"""
connection.execute(stmt)

yield {
"name": display_name,
"user_id": user_id,
"comment_id": comment_id,
"post_history_id": post_history_id,
"post_id": post_id,
"client": bigquery_client,
"connector": connector,
"first_comment_text": comment_text,
"first_post_body": post_body,
"revision_text": revision_text,
"display_name": display_name,
}
# Remove test data and close BigQuery connection in teardown
stmt = f"delete from enterprise_dsr_testing.post_history where id = {post_history_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
Expand Down Expand Up @@ -571,6 +716,30 @@ def bigquery_test_engine(bigquery_keyfile_creds) -> Generator:
engine.dispose()


def seed_bigquery_enterprise_integration_db(
bigquery_enterprise_test_dataset_config,
) -> None:
"""
Currently unused.
This helper function has already been run once, and data has been populated in the test BigQuery enterprise dataset.
We may need this later in case tables are accidentally removed.
"""
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

stmt = f"CREATE TABLE enterprise_dsr_testing.stackoverflow_posts_partitioned partition by date(creation_date) as select * from enterprise_dsr_testing.stackoverflow_posts;"
connection.execute(stmt)

        print(
            "Created table enterprise_dsr_testing.stackoverflow_posts_partitioned, "
            "partitioned on column creation_date."
        )
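
# Illustration only, not part of this PR: a sketch of how to confirm the
# partitioning took effect if this helper is ever re-run, assuming BigQuery's
# standard INFORMATION_SCHEMA.PARTITIONS view is queryable over the same
# `connection` as above.
stmt = """
    select partition_id, total_rows
    from enterprise_dsr_testing.INFORMATION_SCHEMA.PARTITIONS
    where table_name = 'stackoverflow_posts_partitioned'
    order by partition_id;
"""
for partition_id, total_rows in connection.execute(stmt):
    print(f"partition {partition_id}: {total_rows} rows")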


def seed_bigquery_integration_db(bigquery_integration_engine) -> None:
"""
Currently unused.