Skip to content

Commit

Permalink
as found
Browse files Browse the repository at this point in the history
  • Loading branch information
spepler committed Dec 19, 2024
1 parent 21488c4 commit c5f9c90
Show file tree
Hide file tree
Showing 5 changed files with 464 additions and 206 deletions.
39 changes: 39 additions & 0 deletions IDEAS.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,42 @@ while not selected:
This is a good selection
selected == C < T and C > kT or reached the end


# An FBI API

Functions that give a list of records:

fbi_records
fbi_records_under
where_is
ls_query
all_under_query
links_to
get_random_records
fbi_listdir
get_records_by_content
nla_dirs

stats of records:
count
lastest_file
last_updated
archive_summary
parameters

utils:
convert2datetime
splits
_create_id

Write records:
make_dirs
insert_item
update_item
flag_removed
bulk_update
update_file_location

Get single record:
get_record, get_record_attr

59 changes: 36 additions & 23 deletions fbi_core/annotate.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
import datetime
import hashlib
import os
import elasticsearch
import requests
import re
import datetime
from collections import defaultdict
from .conf import APIKEY

import elasticsearch
import requests
from ceda_es_client import CEDAElasticsearchClient

from .conf import APIKEY

# Module-wide Elasticsearch client: send the API key header only when a
# key is configured in .conf, otherwise build the client without it.
if APIKEY:
    es = CEDAElasticsearchClient(headers={"x-api-key": APIKEY})
else:
    es = CEDAElasticsearchClient()

# Index that holds the annotation documents written by this module.
indexname = "fbi-annotations"


def get_moles_records():
r = requests.get("https://catalogue.ceda.ac.uk/api/v2/observations.json/?fields=uuid,title,result_field,status,observationcollection_set&limit=20000")
"""get moles info from catalogue"""
obs_url = "https://catalogue.ceda.ac.uk/api/v2/observations.json"
obs_url += "/?fields=uuid,title,result_field,status,observationcollection_set"
obs_url += "&limit=20000"
r = requests.get(obs_url, timeout=300)

# map collections to paths
observation_records = {}
Expand All @@ -27,15 +33,17 @@ def get_moles_records():
if "result_field" not in ob_record or ob_record["result_field"] is None:
continue
# skip records with bad data path
if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"]["dataPath"].startswith("/"):
if "dataPath" not in ob_record["result_field"] or not ob_record["result_field"][
"dataPath"
].startswith("/"):
continue
data_path = ob_record["result_field"]["dataPath"]

# description records are observation records mostly
observation_records[data_path] = ob_record

for collection_url in ob_record["observationcollection_set"]:
coll_id = int(re.search(r'/(\d+)\.json$', collection_url).group(1))
coll_id = int(re.search(r"/(\d+)\.json$", collection_url).group(1))
collections_paths[coll_id].append(data_path)

# find collections common paths
Expand All @@ -55,7 +63,11 @@ def get_moles_records():
collections_by_path[common_path] = collections_list[0]

# grab collections records
r = requests.get("https://catalogue.ceda.ac.uk/api/v2/observationcollections.json/?limit=10000&fields=ob_id,uuid,title,publicationState")
coll_url = "https://catalogue.ceda.ac.uk/api/v2/observationcollections.json"
coll_url += "/?fields=ob_id,uuid,title,publicationState"
coll_url += "&limit=10000"
r = requests.get(coll_url, timeout=200)

collection_records_by_obid = {}
for collection_rec in r.json()["results"]:
collection_records_by_obid[collection_rec["ob_id"]] = collection_rec
Expand All @@ -64,29 +76,33 @@ def get_moles_records():
for path, coll_id in collections_by_path.items():
collections_by_path[path] = collection_records_by_obid[coll_id]

return observation_records, collections_by_path
return observation_records, collections_by_path


def insert_annotation(path, annotation_type, record, process=None):
    """Insert an annotation record, replacing any existing one with the same id.

    The document id is the SHA-1 hex digest of
    "<normalised path>|<annotation_type>", so the same path and annotation
    type always address the same document in the index.

    Args:
        path: Path the annotation applies to; normalised with
            os.path.normpath before use.
        annotation_type: Key under which ``record`` is stored inside the
            document's "annotation" field.
        record: The annotation payload.
        process: Optional name of the process adding the annotation. It is
            stored as "added_process" only when it is a string.
    """
    path = os.path.normpath(path)
    key = path + "|" + annotation_type
    record_id = hashlib.sha1(key.encode()).hexdigest()
    annotation_record = {
        "added_date": datetime.datetime.now().isoformat(),
        "under": path,
        "annotation": {annotation_type: record},
    }
    if isinstance(process, str):
        annotation_record["added_process"] = process

    # Drop any previous document with this id; a missing one is not an error.
    try:
        es.delete(index=indexname, id=record_id)
    except elasticsearch.exceptions.NotFoundError:
        pass
    es.index(index=indexname, id=record_id, body=annotation_record, request_timeout=100)


def get_fbi_annotations(path):
    """Placeholder, not implemented yet: returns None for any path."""
    pass


def lookup(path):

path = os.path.normpath(path)
Expand All @@ -96,23 +112,20 @@ def lookup(path):
if path in records:
matches.insert(0, records[path])
path = os.path.dirname(path)

combined_rec = {}
for rec in matches:
combined_rec.update(rec)
return combined_rec


def grab_moles():
    """Fetch the MOLES records and store each one as an annotation.

    Every observation record is inserted under its path with annotation
    type "observation", and every collection record under its path with
    annotation type "collection". Each path/record pair is printed before
    it is inserted.
    """
    obs, collections = get_moles_records()
    for ob_path, ob_rec in obs.items():
        print(ob_path, ob_rec)
        insert_annotation(ob_path, "observation", ob_rec)

    for coll_path, coll_rec in collections.items():
        print(coll_path, coll_rec)
        insert_annotation(coll_path, "collection", coll_rec)




Loading

0 comments on commit c5f9c90

Please sign in to comment.