Merge pull request #26 from 4dn-dcic/0.4.6

0.4.6
4dn-dcic · Aug 20, 2018 · e9519fb · e9519fb
2 parents 7017b17 + 1fa0c85
commit e9519fb
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 19 deletions.
diff --git a/dcicutils/_version.py b/dcicutils/_version.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.4.5"
+__version__ = "0.4.6"
diff --git a/dcicutils/beanstalk_utils.py b/dcicutils/beanstalk_utils.py
@@ -91,6 +91,10 @@ def delete_db(db_identifier, take_snapshot=True):
 
 
 def get_health_page_info(bs_url):
+    """
+    Different use cases than ff_utils.get_health_page (that one is oriented
+    towards external API usage and this one is more internal)
+    """
     if not bs_url.endswith('/'):
         bs_url += "/"
     if not bs_url.startswith('http'):

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -351,27 +351,44 @@ def get_es_search_generator(es_client, index, body, page_size=50):
 def get_es_metadata(uuids, es_client=None, key=None, ff_env=None):
     """
     Given a list of string item uuids, will return a
-    dictionary response of the full ES ecord for that item (or an empty
-    dictionary if the item doesn't exist/ is not indexed)
+    list holding the full ES record of each of those items (items that
+    don't exist / are not indexed are left out, so the list may be empty)
     You can pass in an Elasticsearch client (initialized by create_es_client)
     through the es_client param to save init time.
     Same auth mechanism as the other metadata functions
+    Raises a KeyError carrying the health page error message if no es_client
+    is given and the Elasticsearch url cannot be read from the health page
     """
     if es_client is None:
-        # need to know ES server location and item type
-        auth = get_authentication_with_server(key, ff_env)
-        health_res = authorized_request(auth['server'] + '/health', auth=auth, verb='GET')
-        es_url = get_response_json(health_res)['elasticsearch']
+        # need to know ES server location; it is listed on the health page
+        health = get_health_page(key, ff_env)
+        try:
+            es_url = health['elasticsearch']
+        except KeyError:
+            # get_health_page reports failures as {'error': ...} instead of
+            # raising; pass that message on rather than a bare KeyError
+            raise KeyError('elasticsearch not found in health page: %s'
+                           % health.get('error', 'no error reported'))
         es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
     # match all given uuids to _id fields
-    es_query = {'query': {'terms': {'_id': uuids}}, 'sort': [{'_uid': {'order': 'desc'}}]}
+    # sending in too many uuids in the terms query can crash es; break them up
+    # into groups of max size 100
     es_res = []
-    for es_page in get_es_search_generator(es_client, '_all', es_query):
-        # return the document source only; eliminate es metadata
-        es_res.extend([hit['_source'] for hit in es_page])
+    for i in range(0, len(uuids), 100):
+        query_uuids = uuids[i:i + 100]
+        es_query = {'query': {'terms': {'_id': query_uuids}},
+                    'sort': [{'_uid': {'order': 'desc'}}]}
+        for es_page in get_es_search_generator(es_client, '_all', es_query):
+            # return the document source only; eliminate es metadata
+            es_res.extend([hit['_source'] for hit in es_page])
     return es_res
 
 
+def get_health_page(key=None, ff_env=None):
+    """
+    Fetch the FF health page, using the given keys or ff_env for
+    authentication, and return its json.
+    This function is meant to tolerate failure: any exception raised along
+    the way is caught and reported as {'error': <message>} instead of being
+    propagated, so callers should check the result for an 'error' key.
+    """
+    try:
+        creds = get_authentication_with_server(key, ff_env)
+        health_url = creds['server'] + '/health'
+        response = authorized_request(health_url, auth=creds, verb='GET')
+        return get_response_json(response)
+    except Exception as err:
+        # deliberate catch-all; the failure is handed back as data
+        return {'error': str(err)}
+
+
 #####################
 # Utility functions #
 #####################

diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
@@ -350,11 +350,8 @@ def test_get_es_metadata(integrated_ff):
     assert biosample_res['uuid'] == test_biosample
     assert biosample_res['item_type'] == 'biosample'
 
-    # you can also pass in your own elasticsearch client
-    # ugly here because we need to get it from health page
-    health_res = ff_utils.authorized_request(integrated_ff['ff_key']['server'] + '/health',
-                                             auth=integrated_ff['ff_key'])
-    es_url = ff_utils.get_response_json(health_res)['elasticsearch']
+    # you can pass in your own elasticsearch client or build it here
+    es_url = ff_utils.get_health_page(key=integrated_ff['ff_key'])['elasticsearch']
     es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
     res2 = ff_utils.get_es_metadata([test_biosource], es_client=es_client,
                                     key=integrated_ff['ff_key'])
@@ -377,13 +374,19 @@ def test_get_es_metadata(integrated_ff):
     res = ff_utils.get_es_metadata(['blahblah'], key=integrated_ff['ff_key'])
     assert res == []
 
+    # make sure searches work with pagination set at 100 (default)
+    all_items = ff_utils.search_metadata('/search/?type=Item&frame=object', key=integrated_ff['ff_key'])
+    all_uuids = [item['uuid'] for item in all_items]
+    all_es = ff_utils.get_es_metadata(all_uuids, key=integrated_ff['ff_key'])
+    assert len(all_es) == len(all_uuids)
+    all_es_uuids = [item['uuid'] for item in all_es]
+    assert set(all_es_uuids) == set(all_uuids)
+
 
 def test_get_es_search_generator(integrated_ff):
     from dcicutils import es_utils
     # get es_client info from the health page
-    health_res = ff_utils.authorized_request(integrated_ff['ff_key']['server'] + '/health',
-                                             auth=integrated_ff['ff_key'])
-    es_url = ff_utils.get_response_json(health_res)['elasticsearch']
+    es_url = ff_utils.get_health_page(key=integrated_ff['ff_key'])['elasticsearch']
     es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
     es_query = {'query': {'match_all': {}}, 'sort': [{'_uid': {'order': 'desc'}}]}
     # search for all ontology terms with a low pagination size
@@ -402,3 +405,18 @@ def test_get_es_search_generator(integrated_ff):
                                           key=integrated_ff['ff_key'])
     search_uuids = set(hit['uuid'] for hit in search_res)
     assert all_es_uuids == search_uuids
+
+
+def test_get_health_page(integrated_ff):
+    """
+    get_health_page should work with either a key or an ff_env, and should
+    hand back an 'error' entry instead of raising when the lookup fails
+    """
+    # lookup using the access key
+    by_key = ff_utils.get_health_page(key=integrated_ff['ff_key'])
+    assert by_key and 'error' not in by_key
+    for field in ('elasticsearch', 'database'):
+        assert field in by_key
+    env_name = integrated_ff['ff_env']
+    assert by_key['beanstalk_env'] == env_name
+    # same lookup using the environment name instead of the key
+    by_env = ff_utils.get_health_page(ff_env=env_name)
+    assert by_env and 'error' not in by_env
+    assert by_env['elasticsearch'] == by_key['elasticsearch']
+    # an unknown environment must give an error entry, not an exception
+    failed = ff_utils.get_health_page(ff_env='not_an_env')
+    assert failed and 'error' in failed