LA-108: Expand on BQ Enterprise test coverage - partitioning, custom identities (#5618)
eastandwestwind authored Jan 15, 2025
1 parent 4103da1 commit 31dcf58
Showing 4 changed files with 505 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -24,6 +24,7 @@ Changes can also be flagged with a GitHub label for tracking purposes. The URL o
### Added
- Added Action Center MVP behind new feature flag [#5622](https://github.com/ethyca/fides/pull/5622)
- Added cache-clearing methods to the `DBCache` model to allow deleting cache entries [#5629](https://github.com/ethyca/fides/pull/5629)
- Added partitioning, custom identities, and multiple identities to test coverage for BigQuery Enterprise [#5618](https://github.com/ethyca/fides/pull/5618)

### Changed
- Updated brand link url [#5656](https://github.com/ethyca/fides/pull/5656)
55 changes: 55 additions & 0 deletions data/dataset/bigquery_enterprise_test_dataset.yml
@@ -112,6 +112,61 @@ dataset:
data_categories: [user.contact]
- name: view_count
data_categories: [system.operations]
- name: stackoverflow_posts_partitioned
fields:
- name: accepted_answer_id
data_categories: [system.operations]
- name: answer_count
data_categories: [system.operations]
- name: body
data_categories: [user.contact]
- name: comment_count
data_categories: [system.operations]
- name: community_owned_date
data_categories: [system.operations]
- name: creation_date
data_categories: [system.operations]
- name: favorite_count
data_categories: [system.operations]
- name: id
data_categories: [system.operations]
fides_meta:
data_type: integer
- name: last_activity_date
data_categories: [system.operations]
- name: last_edit_date
data_categories: [system.operations]
- name: last_editor_display_name
data_categories: [system.operations]
- name: last_editor_user_id
data_categories: [system.operations]
fides_meta:
references:
- dataset: enterprise_dsr_testing
field: users.id
direction: from
- name: owner_display_name
data_categories: [user.contact]
- name: owner_user_id
data_categories: [system.operations]
fides_meta:
references:
- dataset: enterprise_dsr_testing
field: users.id
direction: from
data_type: integer
- name: parent_id
data_categories: [system.operations]
- name: post_type_id
data_categories: [system.operations]
- name: score
data_categories: [system.operations]
- name: tags
data_categories: [system.operations]
- name: title
data_categories: [user.contact]
- name: view_count
data_categories: [system.operations]
- name: users
fields:
- name: about_me
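The `references` entries under `fides_meta` on `last_editor_user_id` and `owner_user_id` are what let the DSR graph reach the new collection from `users`. As a rough sketch (not the actual Fides query builder), a `direction: from` reference resolves to a lookup along these lines, where the user ids come from rows already matched upstream in `enterprise_dsr_testing.users`:

    # Hypothetical illustration of how a `direction: from` reference is consumed.
    user_ids = [12345]  # ids matched for the data subject upstream (assumed)
    id_list = ", ".join(str(i) for i in user_ids)
    stmt = f"""
        select id, body, owner_display_name
        from enterprise_dsr_testing.stackoverflow_posts_partitioned
        where owner_user_id in ({id_list}) or last_editor_user_id in ({id_list});
    """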
177 changes: 173 additions & 4 deletions tests/fixtures/bigquery_fixtures.py
@@ -202,6 +202,56 @@ def bigquery_enterprise_test_dataset_config(
ctl_dataset.delete(db=db)


@pytest.fixture
def bigquery_enterprise_test_dataset_config_with_partitioning_meta(
bigquery_enterprise_connection_config: ConnectionConfig,
db: Session,
example_datasets: List[Dict],
) -> Generator:
bigquery_enterprise_dataset = example_datasets[16]
fides_key = bigquery_enterprise_dataset["fides_key"]
bigquery_enterprise_connection_config.name = fides_key
bigquery_enterprise_connection_config.key = fides_key

# Update the stackoverflow_posts_partitioned collection to carry partitioning metadata.
# The underlying table is already set up as a partitioned table in BigQuery itself.
stackoverflow_posts_partitioned_collection = next(
collection
for collection in bigquery_enterprise_dataset["collections"]
if collection["name"] == "stackoverflow_posts_partitioned"
)
bigquery_enterprise_dataset["collections"].remove(
stackoverflow_posts_partitioned_collection
)
stackoverflow_posts_partitioned_collection["fides_meta"] = {
"partitioning": {
"where_clauses": [
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `creation_date` <= CURRENT_TIMESTAMP()",
"`creation_date` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `creation_date` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)",
]
}
}
bigquery_enterprise_dataset["collections"].append(
stackoverflow_posts_partitioned_collection
)

bigquery_enterprise_connection_config.save(db=db)

ctl_dataset = CtlDataset.create_from_dataset_dict(db, bigquery_enterprise_dataset)

dataset = DatasetConfig.create(
db=db,
data={
"connection_config_id": bigquery_enterprise_connection_config.id,
"fides_key": fides_key,
"ctl_dataset_id": ctl_dataset.id,
},
)
yield dataset
dataset.delete(db=db)
ctl_dataset.delete(db=db)
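With the fides_meta above, access and erasure queries against this collection are expected to run once per partition window rather than as a single full scan. A minimal sketch of that fan-out, assuming one query per configured where clause (this mirrors the meta above, not the internal Fides executor):

    # Hypothetical fan-out: one query per partition window. `where_clauses` is
    # the list configured above; `user_id` and `connection` are assumed.
    for clause in where_clauses:
        stmt = (
            "select id from enterprise_dsr_testing.stackoverflow_posts_partitioned "
            f"where owner_user_id = {user_id} and {clause};"
        )
        connection.execute(stmt)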


@pytest.fixture
def bigquery_example_test_dataset_config_with_namespace_meta(
bigquery_connection_config_without_default_dataset: ConnectionConfig,
@@ -482,14 +532,14 @@ def bigquery_enterprise_resources(
"""
connection.execute(stmt)

# Create test stackoverflow_posts data. Posts are responses to questions on Stackoverflow, and do not include the original question.
# Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts;"
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
insert into enterprise_dsr_testing.stackoverflow_posts (body, creation_date, id, owner_user_id, owner_display_name)
insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)
@@ -539,7 +589,102 @@ def bigquery_enterprise_resources(
stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts where id = {post_id};"
stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
connection.execute(stmt)
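Both this fixture and the partitioned variant below repeat the same max-id-plus-random-increment trick for every table they seed. A hypothetical helper (not part of this commit) that captures the pattern:

    def _unique_test_id(connection, table: str) -> int:
        """Hypothetical helper: bump the current max id in `table` by a random
        increment so concurrent test runs do not collide."""
        res = connection.execute(f"select max(id) from enterprise_dsr_testing.{table};")
        return res.all()[0][0] + random.randint(0, 99999)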


@pytest.fixture(scope="function")
def bigquery_enterprise_resources_with_partitioning(
bigquery_enterprise_test_dataset_config_with_partitioning_meta,
):
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config_with_partitioning_meta.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

# Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate an id above this max
stmt = "select max(id) from enterprise_dsr_testing.users;"
res = connection.execute(stmt)
# Increment the id by a random number to avoid conflicts on concurrent test runs
random_increment = random.randint(0, 99999)
user_id = res.all()[0][0] + random_increment
display_name = (
f"fides_testing_{user_id}" # prefix to do manual cleanup if needed
)
last_access_date = datetime.now()
creation_date = datetime.now()
location = "Dream World"

# Create test user data
stmt = f"""
insert into enterprise_dsr_testing.users (id, display_name, last_access_date, creation_date, location)
values ({user_id}, '{display_name}', '{last_access_date}', '{creation_date}', '{location}');
"""
connection.execute(stmt)

# Create test stackoverflow_posts_partitioned data. Posts are responses to questions on Stackoverflow, and do not include the original question.
post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_id = res.all()[0][0] + random_increment
stmt = f"""
insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
values ('{post_body}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

# Create test comments data. Comments are responses to posts or questions on Stackoverflow, and do not include the original question or post itself.
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
comment_id = res.all()[0][0] + random_increment
comment_text = "FYI this only works if you have pytest installed locally."
stmt = f"""
insert into enterprise_dsr_testing.comments (id, text, creation_date, post_id, user_id, user_display_name)
values ({comment_id}, '{comment_text}', '{creation_date}', {post_id}, {user_id}, '{display_name}');
"""
connection.execute(stmt)

# Create test post_history data
stmt = "select max(id) from enterprise_dsr_testing.comments;"
res = connection.execute(stmt)
random_increment = random.randint(0, 99999)
post_history_id = res.all()[0][0] + random_increment
revision_text = "this works if you have pytest"
uuid = str(uuid4())
stmt = f"""
insert into enterprise_dsr_testing.post_history (id, text, creation_date, post_id, user_id, post_history_type_id, revision_guid)
values ({post_history_id}, '{revision_text}', '{creation_date}', {post_id}, {user_id}, 1, '{uuid}');
"""
connection.execute(stmt)

yield {
"name": display_name,
"user_id": user_id,
"comment_id": comment_id,
"post_history_id": post_history_id,
"post_id": post_id,
"client": bigquery_client,
"connector": connector,
"first_comment_text": comment_text,
"first_post_body": post_body,
"revision_text": revision_text,
"display_name": display_name,
}
# Remove test data and close BigQuery connection in teardown
stmt = f"delete from enterprise_dsr_testing.post_history where id = {post_history_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.comments where id = {comment_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
connection.execute(stmt)

stmt = f"delete from enterprise_dsr_testing.users where id = {user_id};"
@@ -571,6 +716,30 @@ def bigquery_test_engine(bigquery_keyfile_creds) -> Generator:
engine.dispose()


def seed_bigquery_enterprise_integration_db(
bigquery_enterprise_test_dataset_config,
) -> None:
"""
Currently unused.
This helper function has already been run once, and data has been populated in the test BigQuery enterprise dataset.
We may need this later in case tables are accidentally removed.
"""
bigquery_connection_config = (
bigquery_enterprise_test_dataset_config.connection_config
)
connector = BigQueryConnector(bigquery_connection_config)
bigquery_client = connector.client()
with bigquery_client.connect() as connection:

stmt = f"CREATE TABLE enterprise_dsr_testing.stackoverflow_posts_partitioned partition by date(creation_date) as select * from enterprise_dsr_testing.stackoverflow_posts;"
connection.execute(stmt)

print(
"Created table enterprise_dsr_testing.stackoverflow_posts_partitioned, "
"partitioned on column creation_date."
)
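If the table ever needs to be re-created, the resulting partitions can be spot-checked over the same connection. A sketch using BigQuery's INFORMATION_SCHEMA.PARTITIONS view (assuming the dataset seeded above):

    stmt = """
        select partition_id, total_rows
        from enterprise_dsr_testing.INFORMATION_SCHEMA.PARTITIONS
        where table_name = 'stackoverflow_posts_partitioned'
        order by partition_id;
    """
    for row in connection.execute(stmt):
        print(row)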


def seed_bigquery_integration_db(bigquery_integration_engine) -> None:
"""
Currently unused.