-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmisc_json_formats.py
112 lines (89 loc) · 2.55 KB
/
misc_json_formats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.2
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %% [markdown]
# # JSON Formats
#
# - object is loaded with the correct conversions (but this is re-computed)
# - can shared information be saved as "meta" information?
#
# - [`pd.json_normalize`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) should be able to efficiently combine information
# %%
from io import StringIO

import pandas as pd
from pimmslearn.io.data_objects import MqAllSummaries
from pimmslearn.pandas import get_unique_non_unique_columns
# Project container used throughout this notebook; the cells below read its
# `.df` (a DataFrame) and `.fp_summaries` (a path-like) attributes.
# NOTE(review): presumably loads the collected MaxQuant summaries on
# construction - confirm against `pimmslearn.io.data_objects`.
mq_all_summaries = MqAllSummaries()
# %% [markdown]
# ## summaries.json
# %% [markdown]
# ### Table format with schema
# %%
# json format with categories
# Group the summary columns with the project helper; the result exposes
# `.unique` and `.non_unique` column collections (presumably columns holding
# one shared value vs. columns whose values vary - TODO confirm).
columns = get_unique_non_unique_columns(mq_all_summaries.df)
columns.unique[:2]
# %%
# dtypes of the first three "unique" columns
mq_all_summaries.df[columns.unique[:3]].dtypes
# %%
# Python type of a single cell (first row, fourth column)
type(mq_all_summaries.df.iloc[0,3])
# %%
# Serialize the first row of the "unique" columns as a Table Schema JSON
# document (orient='table' stores a schema next to the values).
meta = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4, orient='table')
# print(meta)
# %%
# Read the document back. The text is wrapped in StringIO because passing
# literal JSON to `read_json` is deprecated since pandas 2.1.
pd.read_json(StringIO(meta), orient='table').T.convert_dtypes()
# %%
# NOTE: raises an error if an integer column contains NaN (see the
# "Table schema bug" section below).
pd.read_json(StringIO(meta), orient='table')
# %%
# Let displayed frames show as many columns as there are non-unique columns.
pd.options.display.max_columns = len(columns.non_unique)
# mq_all_summaries.df[columns.non_unique]
# %%
# Round-trip the first three rows of the non-unique columns through JSON
# (default orient, i.e. without a table schema).
records_json = mq_all_summaries.df[columns.non_unique].iloc[0:3].to_json()
# StringIO wrapper: passing the JSON text itself to `read_json` is deprecated
# since pandas 2.1.
data = pd.read_json(StringIO(records_json))
data
# %%
# Candidate location for a file holding the shared "meta" information,
# derived from the summaries file path (the value is only displayed, not used).
# NOTE(review): assuming `fp_summaries` is a `pathlib.Path`, this evaluates to
# `<parent>/<stem>/_meta.json`, i.e. a file inside a directory named after the
# stem. If `<parent>/<stem>_meta.json` was intended, the last `/` should be a
# string concatenation - confirm.
mq_all_summaries.fp_summaries.parent / mq_all_summaries.fp_summaries.stem / '_meta.json'
# %%
# First row of the "unique" columns again, this time as plain JSON (no table
# schema), read back as a Series.
meta_json = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4)
# StringIO wrapper: passing the JSON text itself to `read_json` is deprecated
# since pandas 2.1.
meta = pd.read_json(StringIO(meta_json), typ='series')
meta
# %%
# Attach every meta entry to `data` as its own column; a scalar entry is
# broadcast to all rows.
for col, value in meta.items():
    data[col] = value
# %%
data
# %% [markdown]
# ## Table schema bug
#
# - Filed as a bug report with pandas: [#40255](https://github.com/pandas-dev/pandas/issues/40255)
# %%
# Environment report (pandas and dependency versions) for the bug report.
pd.show_versions()
# %%
# Installed pandas version on its own, for quick reference.
pd.__version__
# %%
# Self-contained reproducer for the pandas issue linked above: nullable
# integer (Int64) columns with missing values are written with orient='table'
# and then read back.
import traceback
from io import StringIO

import pandas as pd  # the snippet uses `pd`, so import it under that alias

data = {
    'A': [1, 2, 2, pd.NA, 4, 8, 8, 8, 8, 9],
    'B': [pd.NA] * 10,
}
data = pd.DataFrame(data)
data = data.astype(pd.Int64Dtype())  # in my example I get this from data.convert_dtypes()
data_json = data.to_json(orient='table', indent=4)
try:
    # StringIO wrapper: passing literal JSON text to `read_json` is deprecated
    # since pandas 2.1. The reported failure is
    # "ValueError: Cannot convert non-finite values (NA or inf) to integer".
    pd.read_json(StringIO(data_json), orient='table')
except ValueError as e:
    # Show both the short message and the full traceback for the report.
    print(e)
    traceback.print_exc()
# %%
print(data.to_string())
# %%
# Preview, as a nested dict, the first two rows of the first N "unique"
# columns with a fresh 0-based index.
N = 3
first_columns = columns.unique[:N]
meta = (
    mq_all_summaries.df[first_columns]
    .iloc[:2]
    .reset_index(drop=True)
)
meta.to_dict()