15
15
import pandas as pd
16
16
import json
17
17
import os .path
18
+ from typing import List
18
19
19
20
from PyLagoon .config import LagoonConfig
20
21
from PyLagoon .postgresql import PGMeta , build_sql_query
@@ -36,9 +37,10 @@ def __init__(self, config, host=None, port=None):
36
37
self .__cookies = self .__connect ()
37
38
38
39
def __connect (self ):
39
- reply = req .post (self .conn_str + "user/login" ,
40
- json = {"user" : self .__cfg .USER ,
41
- "pass" : self .__cfg .PASSWORD })
40
+ reply = req .post (
41
+ self .conn_str + "user/login" ,
42
+ json = {"user" : self .__cfg .USER , "pass" : self .__cfg .PASSWORD },
43
+ )
42
44
if reply .ok :
43
45
return reply .cookies
44
46
else :
@@ -61,9 +63,7 @@ def sources(self, ontoClass=None, tags=None, columns=None, **kwargs):
61
63
kwargs ["tag" ] = tags
62
64
if columns :
63
65
kwargs ["columns" ] = columns
64
- reply = req .get (self .conn_str + "sources" ,
65
- params = kwargs ,
66
- cookies = self .__cookies )
66
+ reply = req .get (self .conn_str + "sources" , params = kwargs , cookies = self .__cookies )
67
67
return [Source (j ) for j in reply .json ()]
68
68
69
69
def ingest (self , file_path , name , ontoClass = None , tags = None , ** kwargs ):
@@ -79,12 +79,14 @@ def ingest(self, file_path, name, ontoClass=None, tags=None, **kwargs):
79
79
kwargs ["tag" ] = tags
80
80
kwargs ["name" ] = name
81
81
kwargs ["input" ] = os .path .split (file_path )[1 ]
82
- # So the server can guess the fileType
83
- reply = req .post (self .conn_str + "sources" ,
84
- data = open (file_path , "rb" ),
85
- params = kwargs ,
86
- stream = True ,
87
- cookies = self .__cookies )
82
+ # So the server can guess the fileType
83
+ reply = req .post (
84
+ self .conn_str + "sources" ,
85
+ data = open (file_path , "rb" ),
86
+ params = kwargs ,
87
+ stream = True ,
88
+ cookies = self .__cookies ,
89
+ )
88
90
report = (json .loads (line .decode ("utf-8" )) for line in reply .raw )
89
91
stack = []
90
92
last = None
@@ -116,38 +118,67 @@ def users(self):
116
118
reply = req .get (self .conn_str + "users" )
117
119
return reply .json ()
118
120
119
- def tbl (self , source = None , query = None ):
120
- """tbl() in RLagoon
121
-
122
- Give one of source or query.
123
-
124
- source is a Source, query is an sqlalchemy.orm.query.Query created
125
- through use of PyLagoon.postgresql.PGMeta and the sqlalchemy EDSL.
121
+ def download_source (self , source ):
122
+ """Constructs a DataFrame containing an entire source
126
123
"""
124
+ is_json = any (c ["type" ][0 ] == "JSON" for c in source .columns )
125
+ if is_json :
126
+ # We need a JSON document in that case
127
+ # the sql endpoint will return one
128
+ meta = PGMeta ([source ])
129
+ table = meta [source ]
130
+ return self .__tbl_from_raw_sql (build_sql_query (meta .query (table )))
131
+ else :
132
+ reply = req .get (
133
+ self .conn_str + "source/" + str (source .ix ) + "/download" ,
134
+ stream = True ,
135
+ cookies = self .__cookies ,
136
+ )
137
+ if reply .ok :
138
+ return pd .read_csv (reply .text )
139
+
140
+ def download_query (self , query , sources ):
141
+ """Constructs a DataFrame from a SQLAlchemy query and corresponding sources
142
+
143
+ Note that this method will sequentially search for each columns type in the list
144
+ of sources and take the first match. This is necessary since query results only
145
+ include column names and not data source identifiers.
146
+ """
147
+ return self .__tbl_from_raw_sql (build_sql_query (query ), sources )
127
148
128
- if source :
129
- is_json = any (c ["type" ][0 ] == "JSON" for c in source .columns )
130
- if is_json :
131
- # We need a JSON document in that case
132
- # the sql endpoint will return one
133
- meta = PGMeta ([source ])
134
- table = meta [source ]
135
- return self .__tbl_from_raw_sql (build_sql_query (meta .query (table )))
136
- else :
137
- reply = req .get (self .conn_str + "source/" + str (source .ix ) + "/download" ,
138
- stream = True ,
139
- cookies = self .__cookies )
140
- if reply .ok :
141
- return pd .read_csv (reply .text )
142
- elif query :
143
- return self .__tbl_from_raw_sql (build_sql_query (query ))
144
-
145
- def __tbl_from_raw_sql (self , query ):
149
+ def __tbl_from_raw_sql (self , query , sources ):
146
150
reply = req .post (
147
- self .conn_str + "sql" ,
148
- json = {"sql" : query },
149
- stream = True ,
150
- cookies = self .__cookies )
151
- if reply .ok :
152
- # return pd.read_json(reply.raw)
153
- return pd .DataFrame (reply .json ())
151
+ self .conn_str + "sql" , json = {"sql" : query }, stream = True , cookies = self .__cookies
152
+ )
153
+ reply .raise_for_status ()
154
+ return _query_to_df (reply .json (), sources )
155
+
156
+
157
+ def _group_rows (rows : List [dict ]):
158
+ columns = {}
159
+ for row in rows :
160
+ for c , v in row .items ():
161
+ if c in columns :
162
+ columns [c ].append (v )
163
+ else :
164
+ columns [c ] = [v ]
165
+ return columns
166
+
167
+
168
+ def _get_dtype (col_name , sources ):
169
+ for source in sources :
170
+ if col_name in source .col_types :
171
+ return source .col_types [col_name ]
172
+ return object
173
+
174
+
175
def _query_to_df(rows, sources):
    """Convert a list of JSON row dicts into a dtype-aware DataFrame.

    Each column is built as its own Series so it can be given the dtype
    recorded in *sources* (see ``_get_dtype``); columns unknown to every
    source fall back to dtype ``object``.

    :param rows: list of dicts, one per result row
    :param sources: sources searched (in order) for column dtypes
    :returns: pandas.DataFrame with one column per key seen in ``rows``
    """
    grouped = _group_rows(rows)
    col_names = list(grouped.keys())
    # Robustness fix: pd.concat raises ValueError on an empty list, so
    # short-circuit when the query produced no rows/columns.
    if not col_names:
        return pd.DataFrame()
    series = []
    for name in col_names:
        # pop() releases each value list as soon as its Series is built.
        vals = grouped.pop(name)
        series.append(pd.Series(vals, name=name, dtype=_get_dtype(name, sources)))
    df = pd.concat(series, axis=1)
    df.columns = col_names
    return df
0 commit comments