Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions backend/btrixcloud/file_uploads.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class FileUploadOps:
def __init__(self, mdb, org_ops, storage_ops):
self.files = mdb["file_uploads"]
self.crawl_configs = mdb["crawl_configs"]
self.crawls = mdb["crawls"]

self.org_ops = org_ops
self.storage_ops = storage_ops
Expand Down Expand Up @@ -325,6 +326,10 @@ async def delete_seed_file(
if matching_workflow:
raise HTTPException(status_code=400, detail="seed_file_in_use")

matching_crawl = await self.crawls.find_one({"config.seedFileId": file_id})
if matching_crawl:
raise HTTPException(status_code=400, detail="seed_file_in_use")

await self.storage_ops.delete_file_object(org, file)
await self.files.delete_one({"_id": file_id, "oid": org.id})
if file.type == "seedFile":
Expand Down Expand Up @@ -368,6 +373,12 @@ async def cleanup_unused_seed_files(self):
if first_matching_workflow:
continue

first_matching_crawl = await self.crawls.find_one(
{"config.seedFileId": file_id}
)
if first_matching_crawl:
continue

try:
org = await self.org_ops.get_org_by_id(file_dict["oid"])
await self.delete_seed_file(file_id, org)
Expand Down
4 changes: 1 addition & 3 deletions backend/test/test_crawlconfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
_coll_id = None
_admin_crawl_cid = None

_seed_file_id = None


def test_crawl_config_usernames(
crawler_auth_headers, default_org_id, crawler_config_id
Expand Down Expand Up @@ -978,7 +976,7 @@ def test_add_crawl_config_with_seed_file(
assert data["config"]["seeds"] is None


def test_delete_in_use_seed_file(
def test_delete_seed_file_in_use_crawlconfig(
crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
# Attempt to delete in-use seed file, verify we get 400 response
Expand Down
88 changes: 84 additions & 4 deletions backend/test/test_run_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
# (not using the fixture to be able to test running crawl)
admin_crawl_id = None

seed_file_crawl_id = None


def test_list_orgs(admin_auth_headers, default_org_id):
r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
Expand Down Expand Up @@ -1377,12 +1379,14 @@ def test_seed_file_crawl(
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawl_id = r.json()["started"]

global seed_file_crawl_id
seed_file_crawl_id = r.json()["started"]

# Wait for it to complete
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
Expand All @@ -1394,7 +1398,7 @@ def test_seed_file_crawl(

# Check on crawl
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
Expand All @@ -1405,7 +1409,7 @@ def test_seed_file_crawl(

# Validate crawl pages
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/pages",
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
Expand All @@ -1416,3 +1420,79 @@ def test_seed_file_crawl(
"https://specs.webrecorder.net/",
"https://webrecorder.net/",
)


def test_delete_seed_file_in_use_crawl(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    """A seed file still referenced by a finished crawl must not be deletable.

    Detaches the seed file from the workflow first, so the only remaining
    reference is the completed crawl itself, then verifies the delete
    endpoint rejects the request with 400 seed_file_in_use.
    """
    # Remove seed file from workflow by switching the config to inline seeds
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://webrecorder.net"}],
                "scopeType": "page",
                "limit": 1,
                "seedFileId": None,
            }
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    # Swapping the seed source is a settings change, not a metadata change
    assert data["metadata_changed"] is False
    assert data["settings_changed"] is True

    # Verify seed file was removed from the workflow config
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["config"]["seedFileId"] is None

    # Attempt to delete seed file: the completed crawl still references it,
    # so we must get a 400 response
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "seed_file_in_use"

    # File must still exist after the rejected delete
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["id"] == seed_file_id


def test_delete_seed_file_not_in_use(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    """Once no crawl or workflow references the seed file, deletion succeeds."""
    file_endpoint = f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}"

    # Delete the crawl that used the seed file so the file is no longer in use
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [seed_file_crawl_id]},
    )
    assert r.status_code == 200
    assert r.json()["deleted"] == 1

    # Deleting the now-unreferenced seed file should succeed
    r = requests.delete(file_endpoint, headers=crawler_auth_headers)
    assert r.status_code == 200
    assert r.json()["success"]

    # The file is gone
    r = requests.get(file_endpoint, headers=crawler_auth_headers)
    assert r.status_code == 404
10 changes: 4 additions & 6 deletions backend/test/test_uploads.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 7
assert data["total"] == 6
for item in data["items"]:
assert item["type"] == "crawl"

Expand Down Expand Up @@ -639,7 +639,7 @@ def test_get_all_crawls_by_user(
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 6
assert data["total"] == 5
for item in data["items"]:
assert item["userid"] == crawler_userid

Expand Down Expand Up @@ -823,15 +823,14 @@ def test_all_crawls_search_values(
assert r.status_code == 200
data = r.json()

assert len(data["names"]) == 9
assert len(data["names"]) == 8
expected_names = [
"Crawler User Test Crawl",
"Custom Behavior Logs",
"My Upload Updated",
"test2.wacz",
"All Crawls Test Crawl",
"Crawler User Crawl for Testing QA",
"Seed File Test Crawl",
]
for expected_name in expected_names:
assert expected_name in data["names"]
Expand All @@ -851,14 +850,13 @@ def test_all_crawls_search_values(
assert r.status_code == 200
data = r.json()

assert len(data["names"]) == 6
assert len(data["names"]) == 5
expected_names = [
"Admin Test Crawl",
"All Crawls Test Crawl",
"Crawler User Crawl for Testing QA",
"Crawler User Test Crawl",
"Custom Behavior Logs",
"Seed File Test Crawl",
]
for expected_name in expected_names:
assert expected_name in data["names"]
Expand Down
Loading