chore(preprocess): convenient preprocessing; better statistics display

bwdeng20 · bwdeng20 · commit 995715c39af9 · 2023-08-30T21:25:07.000+08:00
i) preprocess all datasets with a single command.
ii) display the dataset
statistics as a more concise table
diff --git a/preprocess_data/data_statistics.py b/preprocess_data/data_statistics.py
@@ -1,14 +1,25 @@
 import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def pprint_df(df, tablefmt='psql'):
+    print(tabulate(df, headers='keys', tablefmt=tablefmt))
+
 
 if __name__ == "__main__":
-    for dataset_name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo', 'uci',
-                         'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts']:
+    all_datasets = ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo', 'uci',
+                    'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts']
+    records = []
+    for dataset_name in sorted(all_datasets, key=lambda v: v.upper()):
         edge_raw_features = np.load('../processed_data/{}/ml_{}.npy'.format(dataset_name, dataset_name))
         node_raw_features = np.load('../processed_data/{}/ml_{}_node.npy'.format(dataset_name, dataset_name))
+        info = {'name': dataset_name,
+                'num_nodes': node_raw_features.shape[0] - 1,
+                'node_fea_dim': node_raw_features.shape[-1],
+                'num_edges': edge_raw_features.shape[0] - 1,
+                'edge_fea_dim': edge_raw_features.shape[-1]}
+        records.append(info)
 
-        print('Statistics of dataset ', dataset_name)
-        print('number of nodes ', node_raw_features.shape[0] - 1)
-        print('number of node features ', node_raw_features.shape[1])
-        print('number of edges ', edge_raw_features.shape[0] - 1)
-        print('number of edge features ', edge_raw_features.shape[1])
-        print('====================================')
+    info_df = pd.DataFrame.from_records(records)
+    pprint_df(info_df)
diff --git a/preprocess_data/preprocess_all_data.py b/preprocess_data/preprocess_all_data.py
@@ -0,0 +1,5 @@
+import os
+
+for name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'enron', 'SocialEvo', 'myket',
+             'uci', 'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts']:
+    os.system(f'python preprocess_data.py  --dataset_name {name}')
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,5 @@
+torch>=1.8.1
+numpy
+pandas
+tqdm
+tabulate