as found

cedadev · Dec 19, 2024 · c5f9c90 · c5f9c90
1 parent 21488c4
commit c5f9c90
Show file tree

Hide file tree

Showing 5 changed files with 464 additions and 206 deletions.
diff --git a/IDEAS.md b/IDEAS.md
@@ -109,3 +109,42 @@ while not selected:
     This is a good selection  
     selected == C < T and C > kT or reached the end
 
+
+# An FBI API
+
+Functions that give a list of records:
+
+fbi_records
+fbi_records_under
+where_is
+ls_query
+all_under_query
+links_to
+get_random_records
+fbi_listdir
+get_records_by_content
+nla_dirs
+
+stats of records:
+count
+lastest_file
+last_updated
+archive_summary
+parameters
+
+utils:
+convert2datetime
+splits
+_create_id
+
+Write records:
+make_dirs
+insert_item
+update_item
+flag_removed
+bulk_update
+update_file_location
+
+Get single record:
+get_record, get_record_attr
+
diff --git a/fbi_core/annotate.py b/fbi_core/annotate.py
@@ -1,23 +1,29 @@
+import datetime
 import hashlib
 import os
-import elasticsearch
-import requests
 import re
-import datetime
 from collections import defaultdict
-from .conf import APIKEY
+
+import elasticsearch
+import requests
 from ceda_es_client import CEDAElasticsearchClient
 
+from .conf import APIKEY
+
 if APIKEY:
-    es = CEDAElasticsearchClient(headers={'x-api-key': APIKEY})
+    es = CEDAElasticsearchClient(headers={"x-api-key": APIKEY})
 else:
     es = CEDAElasticsearchClient()
-    
+
 indexname = "fbi-annotations"
 
 
 def get_moles_records():
-    r = requests.get("https://catalogue.ceda.ac.uk/api/v2/observations.json/?fields=uuid,title,result_field,status,observationcollection_set&limit=20000")
+    """get moles info from catalogue"""
+    obs_url = "https://catalogue.ceda.ac.uk/api/v2/observations.json"
+    obs_url += "/?fields=uuid,title,result_field,status,observationcollection_set"
+    obs_url += "&limit=20000"
+    r = requests.get(obs_url, timeout=300)
 
     # map collections to paths
     observation_records = {}
@@ -27,15 +33,17 @@ def get_moles_records():
         if "result_field" not in ob_record or ob_record["result_field"] is None:
             continue
         # skip records with bad data path
-        if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"]["dataPath"].startswith("/"):
+        if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"][
+            "dataPath"
+        ].startswith("/"):
             continue
         data_path = ob_record["result_field"]["dataPath"]
- 
+
         # description records are observation records mostly
         observation_records[data_path] = ob_record
 
         for collection_url in ob_record["observationcollection_set"]:
-            coll_id = int(re.search(r'/(\d+)\.json$', collection_url).group(1))
+            coll_id = int(re.search(r"/(\d+)\.json$", collection_url).group(1))
             collections_paths[coll_id].append(data_path)
 
     # find collections common paths
@@ -55,7 +63,11 @@ def get_moles_records():
             collections_by_path[common_path] = collections_list[0]
 
     # grab collections records
-    r = requests.get("https://catalogue.ceda.ac.uk/api/v2/observationcollections.json/?limit=10000&fields=ob_id,uuid,title,publicationState")
+    coll_url = "https://catalogue.ceda.ac.uk/api/v2/observationcollections.json"
+    coll_url += "/?fields=ob_id,uuid,title,publicationState"
+    coll_url += "&limit=10000"
+    r = requests.get(coll_url, timeout=200)
+
     collection_records_by_obid = {}
     for collection_rec in r.json()["results"]:
         collection_records_by_obid[collection_rec["ob_id"]] = collection_rec
@@ -64,29 +76,33 @@ def get_moles_records():
     for path, coll_id in collections_by_path.items():
         collections_by_path[path] = collection_records_by_obid[coll_id]
 
-    return observation_records, collections_by_path  
+    return observation_records, collections_by_path
+
 
 def insert_annotation(path, annotation_type, record, process=None):
     """Insert record by replaceing it"""
     path = os.path.normpath(path)
     key = path + "|" + annotation_type
     record_id = hashlib.sha1(key.encode()).hexdigest()
-    annotation_record = {"added_date": datetime.datetime.now().isoformat(),
-                         "under": path, 
-                         "annotation":  {annotation_type: record}}   
+    annotation_record = {
+        "added_date": datetime.datetime.now().isoformat(),
+        "under": path,
+        "annotation": {annotation_type: record},
+    }
     if isinstance(process, str):
         annotation_record["added_process"] = process
 
     try:
         es.delete(index=indexname, id=record_id)
     except elasticsearch.exceptions.NotFoundError:
         pass
-    es.index(index=indexname, id=record_id, body=annotation_record, request_timeout=100) 
+    es.index(index=indexname, id=record_id, body=annotation_record, request_timeout=100)
 
 
-def get_fbi_annotations(path): 
+def get_fbi_annotations(path):
     pass
 
+
 def lookup(path):
 
     path = os.path.normpath(path)
@@ -96,23 +112,20 @@ def lookup(path):
         if path in records:
             matches.insert(0, records[path])
         path = os.path.dirname(path)
-    
+
     combined_rec = {}
     for rec in matches:
         combined_rec.update(rec)
     return combined_rec
 
+
 def grab_moles():
 
     obs, collections = get_moles_records()
     for ob_path, ob_rec in obs.items():
-        print(ob_path, ob_rec) 
+        print(ob_path, ob_rec)
         insert_annotation(ob_path, "observation", ob_rec)
 
     for coll_path, coll_rec in collections.items():
         print(coll_path, coll_rec)
         insert_annotation(coll_path, "collection", coll_rec)
-
-
-
-