@@ -17,7 +17,7 @@ def extract_kaggle(file_path):
17
17
kaggle_identifiers = [f"{ username } /{ dataset } " for username , dataset in matches ]
18
18
return kaggle_identifiers
19
19
20
- dir = "_datasets"
20
+ dir = "blastnet.github.io/ _datasets"
21
21
total_bytes = 0
22
22
total_size = 0
23
23
json_dump = {}
@@ -75,25 +75,33 @@ def format_bytes(num_bytes):
75
75
num_bytes /= factor
76
76
unit_index += 1
77
77
return f"{ num_bytes :.3f} { units [unit_index ]} "
78
def unformat_bytes(string):
    """Convert a human-readable size string such as ``"1.500 KB"`` to bytes.

    The input must be of the form ``"<number> <unit>"``, where ``<unit>`` is
    one of B, KB, MB, GB, TB, PB, EB, ZB or YB. Units are decimal: each step
    up the list multiplies by 1000.

    Args:
        string: Size text, e.g. ``"12.345 GB"``. Surrounding whitespace,
            repeated separators and lower-case units are tolerated.

    Returns:
        The size in bytes as a float.

    Raises:
        ValueError: If the text is not exactly a number followed by a known
            unit, or the number cannot be parsed as a float.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    factor = 1000

    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, unlike split(" ").
    parts = string.split()
    if len(parts) != 2:
        raise ValueError(f"Expected '<number> <unit>', got {string!r}")

    num, unit = parts
    unit = unit.upper()
    if unit not in units:
        raise ValueError(f"Unknown size unit {unit!r} in {string!r}")

    return float(num) * (factor ** units.index(unit))
78
83
79
84
# SPECIFIC DATASET STATISTICS TO OUTPUT
80
85
# Take the maximum of views/downloads from each of the sub-datasets
81
86
# More representative than summing, since the same user would likely view multiple sub-datasets
82
- ds_size = format_bytes (np .sum (size_in_bytes ))
87
+ ds_size_raw = np .sum (size_in_bytes )
88
+ ds_size = format_bytes (ds_size_raw )
83
89
ds_views = np .max (views ) #np.sum(views)
84
90
ds_downs = np .max (downloads ) #np.sum(downloads)
85
91
print (f'{ filename } ({ ds_size } ) processed. { ds_views } views, { ds_downs } downloads.' )
86
92
87
- # Use old data as fallback
88
- if not ds_size :
93
+ if not ds_size_raw :
94
+ # Use old data as fallback
89
95
kaggle_stats = json .loads (gist .files ['kaggle_stats.json' ].content )
90
96
kaggle_stats = kaggle_stats [filename ]
91
-
92
- # Save as dictionary and throw it to the preamble
93
- kaggle_stats = {
94
- 'size' : ds_size ,
95
- 'views' : ds_views ,
96
- 'downloads' : ds_downs ,
97
+ size_in_bytes = unformat_bytes (kaggle_stats ['size' ])
98
+ downloads = kaggle_stats ['downloads' ]
99
+ else :
100
+ # Save as dictionary and throw it to the preamble
101
+ kaggle_stats = {
102
+ 'size' : ds_size ,
103
+ 'views' : ds_views ,
104
+ 'downloads' : ds_downs ,
97
105
}
98
106
json_dump [filename ] = kaggle_stats
99
107
total_bytes += int (np .sum (downloads * size_in_bytes ))
@@ -102,6 +110,9 @@ def format_bytes(num_bytes):
102
110
if not total_bytes :
103
111
raise Exception ("Zero data encountered, exiting" )
104
112
exit ()
113
+ #old_data = json.loads(gist.files['kaggle_stats.json'].content)
114
+ #total_bytes = old_data['total_bytes']
115
+ #total_size = old_data['total_size']
105
116
106
117
json_dump ['total_bytes' ] = total_bytes
107
118
json_dump ['total_size' ] = total_size
0 commit comments