Skip to content

Commit

Permalink
Merge pull request #21 from cedadev/add-annotations
Browse files Browse the repository at this point in the history
Add annotations
  • Loading branch information
spepler authored Oct 23, 2024
2 parents 8925e9b + 21488c4 commit b1d440f
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sphinx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
docs-folder: "docs/"
pre-build-command: "sudo apt-get -y install git"
- name: Upload artifacts
uses: actions/upload-artifact@v1
uses: actions/upload-artifact@v4
with:
name: html-docs
path: docs/build/html/
Expand Down
118 changes: 118 additions & 0 deletions fbi_core/annotate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import hashlib
import os
import elasticsearch
import requests
import re
import datetime
from collections import defaultdict
from .conf import APIKEY
from ceda_es_client import CEDAElasticsearchClient

# Build the Elasticsearch client once at import time: authenticated via the
# x-api-key header when an API key is configured, anonymous otherwise.
if APIKEY:
    es = CEDAElasticsearchClient(headers={'x-api-key': APIKEY})
else:
    es = CEDAElasticsearchClient()

# Elasticsearch index that holds the annotation documents.
indexname = "fbi-annotations"


def get_moles_records():
    """Fetch observation and collection records from the CEDA MOLES catalogue.

    Returns:
        tuple: ``(observation_records, collections_by_path)`` where
        ``observation_records`` maps a data path to its observation record,
        and ``collections_by_path`` maps a common data path to the single
        collection record unambiguously associated with that path.
    """
    # Timeout added so a stalled catalogue service cannot hang the caller.
    r = requests.get(
        "https://catalogue.ceda.ac.uk/api/v2/observations.json/"
        "?fields=uuid,title,result_field,status,observationcollection_set&limit=20000",
        timeout=300)

    # map collections to paths
    observation_records = {}
    collections_paths = defaultdict(list)
    for ob_record in r.json()["results"]:
        # skip records with no results field
        if "result_field" not in ob_record or ob_record["result_field"] is None:
            continue
        # skip records with bad data path (must be an absolute path)
        if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"]["dataPath"].startswith("/"):
            continue
        data_path = ob_record["result_field"]["dataPath"]

        # description records are observation records mostly
        observation_records[data_path] = ob_record

        for collection_url in ob_record["observationcollection_set"]:
            # extract the numeric collection id from ".../<id>.json";
            # skip URLs that do not match rather than raising AttributeError
            match = re.search(r'/(\d+)\.json$', collection_url)
            if match is None:
                continue
            collections_paths[int(match.group(1))].append(data_path)

    # find each collection's common ancestor path
    collections_common_paths = {}
    for coll_id, paths in collections_paths.items():
        collections_common_paths[coll_id] = os.path.commonpath(paths)

    # invert the dict: common path -> list of collection ids sharing it
    common_paths_collections = defaultdict(list)
    for coll_id, common_path in collections_common_paths.items():
        common_paths_collections[common_path].append(coll_id)

    # keep only unambiguous common paths (exactly one collection per path)
    collections_by_path = {}
    for common_path, collections_list in common_paths_collections.items():
        if len(collections_list) == 1:
            collections_by_path[common_path] = collections_list[0]

    # grab full collection records
    r = requests.get(
        "https://catalogue.ceda.ac.uk/api/v2/observationcollections.json/"
        "?limit=10000&fields=ob_id,uuid,title,publicationState",
        timeout=300)
    collection_records_by_obid = {}
    for collection_rec in r.json()["results"]:
        collection_records_by_obid[collection_rec["ob_id"]] = collection_rec

    # swap collection id for full collection record
    for path, coll_id in collections_by_path.items():
        collections_by_path[path] = collection_records_by_obid[coll_id]

    return observation_records, collections_by_path

def insert_annotation(path, annotation_type, record, process=None):
    """Insert an annotation record, replacing any existing record for the
    same path and annotation type.

    Args:
        path: filesystem path the annotation applies to (normalised before use).
        annotation_type: label for the annotation, e.g. "observation" or
            "collection".
        record: annotation payload, stored under ``annotation[annotation_type]``.
        process: optional name of the process that added the record; only
            stored when it is a string.
    """
    path = os.path.normpath(path)
    # Stable document id derived from path + type, so re-inserting the same
    # (path, type) pair overwrites the previous document.
    key = path + "|" + annotation_type
    record_id = hashlib.sha1(key.encode()).hexdigest()
    annotation_record = {"added_date": datetime.datetime.now().isoformat(),
                         "under": path,
                         "annotation": {annotation_type: record}}
    if isinstance(process, str):
        annotation_record["added_process"] = process

    # Indexing with an explicit id fully replaces any existing document, so
    # the previous delete-then-index round-trip is unnecessary.
    es.index(index=indexname, id=record_id, body=annotation_record, request_timeout=100)


def get_fbi_annotations(path):
    """Return FBI annotations for *path*. Not yet implemented (stub)."""
    pass

def lookup(path, records=None):
    """Combine annotation records for *path* and all of its ancestors.

    Records nearer the root are applied first, so records for deeper (more
    specific) paths override keys from shallower ones.

    Args:
        path: path to look up (normalised before use).
        records: mapping of path -> record to search. Defaults to an empty
            mapping. (The original referenced an undefined global ``records``,
            which raised NameError at runtime.)

    Returns:
        dict: the merged record, deeper paths taking precedence.
    """
    if records is None:
        records = {}
    path = os.path.normpath(path)
    matches = []
    while path != "/":
        if path in records:
            # prepend so shallower paths end up earlier in the list
            matches.insert(0, records[path])
        parent = os.path.dirname(path)
        if parent == path:
            # dirname no longer shrinks the path (e.g. a relative path
            # reduced to ""): stop rather than loop forever
            break
        path = parent

    combined_rec = {}
    for rec in matches:
        combined_rec.update(rec)
    return combined_rec

def grab_moles():
    """Fetch MOLES catalogue records and store each one as an FBI annotation."""
    observations, collections = get_moles_records()

    # Observation records, keyed by their data path.
    for ann_path, ann_record in observations.items():
        print(ann_path, ann_record)
        insert_annotation(ann_path, "observation", ann_record)

    # Collection records, keyed by their unambiguous common path.
    for ann_path, ann_record in collections.items():
        print(ann_path, ann_record)
        insert_annotation(ann_path, "collection", ann_record)




8 changes: 5 additions & 3 deletions fbi_core/fbi_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ def fbi_records_under(path="/", fetch_size=10000, exclude_phenomena=False, **kwa
query["_source"] = {"exclude": ["phenomena"]}
query["sort"] = [{ "path.keyword": "asc" }]
query["size"] = fetch_size

query["search_after"] = search_after

while True:
result = es.search(index=indexname, body=query, request_timeout=900, search_after=search_after)
result = es.search(index=indexname, body=query, request_timeout=900)
nfound = len(result["hits"]["hits"])
if nfound == 0 and current_scope == path:
break
Expand All @@ -82,10 +83,11 @@ def fbi_records_under(path="/", fetch_size=10000, exclude_phenomena=False, **kwa
current_scope = "/".join(lastpath.split("/")[:current_scope_depth+1])
query = all_under_query(current_scope, **kwargs)
query["sort"] = [{ "path.keyword": "asc" }]
query["size"] = fetch_size
query["size"] = fetch_size
n += nfound
if len(result["hits"]["hits"]) > 0:
search_after = result["hits"]["hits"][-1]["sort"]
query["search_after"] = search_after

for record in result["hits"]["hits"]:
yield record["_source"]
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
entry_points={
'console_scripts': [
'fbi_filesize=fbi_core.fbi_filesize:summary',
'fbi_ls=fbi_core.fbi_filesize:ls',
'fbi_ls=fbi_core.fbi_filesize:ls2',
'fbi_random=fbi_core.fbi_filesize:random_paths',
'fbi_show_record=fbi_core.fbi_filesize:show_record',
'fbi_parameters=fbi_core.fbi_filesize:show_parameters',
Expand All @@ -56,6 +56,7 @@
'fbi_launch_run=fbi_core.fbi_dump:launch_run',
'fbi_batch_run=fbi_core.fbi_dump:batch_run',
'fbi_links_to=fbi_core.fbi_filesize:find_links_to',
'fbi_annotate=fbi_core.annotate:grab_moles',
],
},
)

0 comments on commit b1d440f

Please sign in to comment.