Skip to content

Commit

Permalink
Merge pull request #21 from cedadev/add-annotations
Browse files Browse the repository at this point in the history
Add annotations
  • Loading branch information
spepler authored Oct 23, 2024
2 parents 8925e9b + 21488c4 commit b1d440f
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sphinx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
docs-folder: "docs/"
pre-build-command: "sudo apt-get -y install git"
- name: Upload artifacts
uses: actions/upload-artifact@v1
uses: actions/upload-artifact@v4
with:
name: html-docs
path: docs/build/html/
Expand Down
118 changes: 118 additions & 0 deletions fbi_core/annotate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import hashlib
import os
import elasticsearch
import requests
import re
import datetime
from collections import defaultdict
from .conf import APIKEY
from ceda_es_client import CEDAElasticsearchClient

# Build the Elasticsearch client once at import time: authenticated via the
# x-api-key header when an API key is configured, anonymous otherwise.
if APIKEY:
    es = CEDAElasticsearchClient(headers={'x-api-key': APIKEY})
else:
    es = CEDAElasticsearchClient()

# Elasticsearch index that holds the annotation documents.
indexname = "fbi-annotations"


def get_moles_records():
    """Fetch observation and collection records from the CEDA MOLES catalogue.

    Returns:
        tuple: ``(observation_records, collections_by_path)`` where
        ``observation_records`` maps a data path to its observation record,
        and ``collections_by_path`` maps a common data path to the single
        collection record unambiguously associated with that path.
    """
    # Timeout added so a stalled catalogue service cannot hang the caller.
    r = requests.get(
        "https://catalogue.ceda.ac.uk/api/v2/observations.json/"
        "?fields=uuid,title,result_field,status,observationcollection_set&limit=20000",
        timeout=300)

    # map collections to paths
    observation_records = {}
    collections_paths = defaultdict(list)
    for ob_record in r.json()["results"]:
        # skip records with no results field
        if "result_field" not in ob_record or ob_record["result_field"] is None:
            continue
        # skip records with bad data path (must be an absolute path)
        if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"]["dataPath"].startswith("/"):
            continue
        data_path = ob_record["result_field"]["dataPath"]

        # description records are observation records mostly
        observation_records[data_path] = ob_record

        for collection_url in ob_record["observationcollection_set"]:
            # extract the numeric collection id from ".../<id>.json";
            # skip URLs that do not match rather than raising AttributeError
            match = re.search(r'/(\d+)\.json$', collection_url)
            if match is None:
                continue
            collections_paths[int(match.group(1))].append(data_path)

    # find each collection's common ancestor path
    collections_common_paths = {}
    for coll_id, paths in collections_paths.items():
        collections_common_paths[coll_id] = os.path.commonpath(paths)

    # invert the dict: common path -> list of collection ids sharing it
    common_paths_collections = defaultdict(list)
    for coll_id, common_path in collections_common_paths.items():
        common_paths_collections[common_path].append(coll_id)

    # keep only unambiguous common paths (exactly one collection per path)
    collections_by_path = {}
    for common_path, collections_list in common_paths_collections.items():
        if len(collections_list) == 1:
            collections_by_path[common_path] = collections_list[0]

    # grab full collection records
    r = requests.get(
        "https://catalogue.ceda.ac.uk/api/v2/observationcollections.json/"
        "?limit=10000&fields=ob_id,uuid,title,publicationState",
        timeout=300)
    collection_records_by_obid = {}
    for collection_rec in r.json()["results"]:
        collection_records_by_obid[collection_rec["ob_id"]] = collection_rec

    # swap collection id for full collection record
    for path, coll_id in collections_by_path.items():
        collections_by_path[path] = collection_records_by_obid[coll_id]

    return observation_records, collections_by_path

def insert_annotation(path, annotation_type, record, process=None):
    """Insert an annotation record, replacing any existing record for the
    same path and annotation type.

    Args:
        path: filesystem path the annotation applies to (normalised before use).
        annotation_type: label for the annotation, e.g. "observation" or
            "collection".
        record: annotation payload, stored under ``annotation[annotation_type]``.
        process: optional name of the process that added the record; only
            stored when it is a string.
    """
    path = os.path.normpath(path)
    # Stable document id derived from path + type, so re-inserting the same
    # (path, type) pair overwrites the previous document.
    key = path + "|" + annotation_type
    record_id = hashlib.sha1(key.encode()).hexdigest()
    annotation_record = {"added_date": datetime.datetime.now().isoformat(),
                         "under": path,
                         "annotation": {annotation_type: record}}
    if isinstance(process, str):
        annotation_record["added_process"] = process

    # Indexing with an explicit id fully replaces any existing document, so
    # the previous delete-then-index round-trip is unnecessary.
    es.index(index=indexname, id=record_id, body=annotation_record, request_timeout=100)


def get_fbi_annotations(path):
    """Return FBI annotations for *path*. Not yet implemented (stub)."""
    pass

def lookup(path, records=None):
    """Combine annotation records for *path* and all of its ancestors.

    Records nearer the root are applied first, so records for deeper (more
    specific) paths override keys from shallower ones.

    Args:
        path: path to look up (normalised before use).
        records: mapping of path -> record to search. Defaults to an empty
            mapping. (The original referenced an undefined global ``records``,
            which raised NameError at runtime.)

    Returns:
        dict: the merged record, deeper paths taking precedence.
    """
    if records is None:
        records = {}
    path = os.path.normpath(path)
    matches = []
    while path != "/":
        if path in records:
            # prepend so shallower paths end up earlier in the list
            matches.insert(0, records[path])
        parent = os.path.dirname(path)
        if parent == path:
            # dirname no longer shrinks the path (e.g. a relative path
            # reduced to ""): stop rather than loop forever
            break
        path = parent

    combined_rec = {}
    for rec in matches:
        combined_rec.update(rec)
    return combined_rec

def grab_moles():
    """Fetch MOLES catalogue records and store each one as an FBI annotation."""
    observations, collections = get_moles_records()

    # Observation records, keyed by their data path.
    for ann_path, ann_record in observations.items():
        print(ann_path, ann_record)
        insert_annotation(ann_path, "observation", ann_record)

    # Collection records, keyed by their unambiguous common path.
    for ann_path, ann_record in collections.items():
        print(ann_path, ann_record)
        insert_annotation(ann_path, "collection", ann_record)




8 changes: 5 additions & 3 deletions fbi_core/fbi_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ def fbi_records_under(path="/", fetch_size=10000, exclude_phenomena=False, **kwa
query["_source"] = {"exclude": ["phenomena"]}
query["sort"] = [{ "path.keyword": "asc" }]
query["size"] = fetch_size

query["search_after"] = search_after

while True:
result = es.search(index=indexname, body=query, request_timeout=900, search_after=search_after)
result = es.search(index=indexname, body=query, request_timeout=900)
nfound = len(result["hits"]["hits"])
if nfound == 0 and current_scope == path:
break
Expand All @@ -82,10 +83,11 @@ def fbi_records_under(path="/", fetch_size=10000, exclude_phenomena=False, **kwa
current_scope = "/".join(lastpath.split("/")[:current_scope_depth+1])
query = all_under_query(current_scope, **kwargs)
query["sort"] = [{ "path.keyword": "asc" }]
query["size"] = fetch_size
query["size"] = fetch_size
n += nfound
if len(result["hits"]["hits"]) > 0:
search_after = result["hits"]["hits"][-1]["sort"]
query["search_after"] = search_after

for record in result["hits"]["hits"]:
yield record["_source"]
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
entry_points={
'console_scripts': [
'fbi_filesize=fbi_core.fbi_filesize:summary',
'fbi_ls=fbi_core.fbi_filesize:ls',
'fbi_ls=fbi_core.fbi_filesize:ls2',
'fbi_random=fbi_core.fbi_filesize:random_paths',
'fbi_show_record=fbi_core.fbi_filesize:show_record',
'fbi_parameters=fbi_core.fbi_filesize:show_parameters',
Expand All @@ -56,6 +56,7 @@
'fbi_launch_run=fbi_core.fbi_dump:launch_run',
'fbi_batch_run=fbi_core.fbi_dump:batch_run',
'fbi_links_to=fbi_core.fbi_filesize:find_links_to',
'fbi_annotate=fbi_core.annotate:grab_moles',
],
},
)

0 comments on commit b1d440f

Please sign in to comment.