
Commit 91e8ec7

update README for the new Myket dataset

1 parent 20eb15e

File tree: 4 files changed (+16 -10 lines)

- README.md
- preprocess_data/data_statistics.py
- preprocess_data/preprocess_all_data.py
- preprocess_data/preprocess_data.py

README.md (+11 -4)
````diff
@@ -13,15 +13,22 @@ US Legis., UN Trade, UN Vote, and Contact. The first five datasets are bipartite
 
 Most of the used original dynamic graph datasets come from [Towards Better Evaluation for Dynamic Link Prediction](https://openreview.net/forum?id=1GVpwr2Tfdg),
 which can be downloaded [here](https://zenodo.org/record/7213796#.Y1cO6y8r30o).
-Please first download them and put them in ```DG_data``` folder.
-Then, please run ```preprocess_data/preprocess_data.py``` for pre-processing the datasets.
+Please download them and put them in ```DG_data``` folder.
+The Myket dataset comes from [Effect of Choosing Loss Function when Using T-batching for Representation Learning on Dynamic Networks](https://arxiv.org/abs/2308.06862) and
+can be accessed from [here](https://github.com/erfanloghmani/myket-android-application-market-dataset).
+The original and preprocessed files for Myket dataset are included in this repository.
+
+We can run ```preprocess_data/preprocess_data.py``` for pre-processing the datasets.
 For example, to preprocess the *Wikipedia* dataset, we can run the following commands:
 ```{bash}
 cd preprocess_data/
 python preprocess_data.py --dataset_name wikipedia
 ```
-
-The Myket dataset comes from [Effect of Choosing Loss Function when Using T-batching for Representation Learning on Dynamic Networks](https://arxiv.org/abs/2308.06862) and can be accessed from [here](https://github.com/erfanloghmani/myket-android-application-market-dataset). The preprocessed files for this dataset are included in the repository and are located at `processed_data/myket`.
+We can also run the following commands to preprocess all the original datasets at once:
+```{bash}
+cd preprocess_data/
+python preprocess_all_data.py
+```
 
 ## Dynamic Graph Learning Models
 Eight popular continuous-time dynamic graph learning methods are included in DyGLib, including
````
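Since the preprocessed Myket files ship with the repository (the removed README line placed them at `processed_data/myket`), a quick load can confirm they are intact. A minimal sketch, assuming the `ml_<name>.npy` layout that `data_statistics.py` below reads, where the `- 1` counts suggest a padding row at index 0:

```python
# Sanity-check the bundled Myket files; paths follow the removed README
# note (processed_data/myket) and the layout read by data_statistics.py.
# The "- 1" mirrors that script and assumes row 0 is padding.
import numpy as np

edge_raw_features = np.load('processed_data/myket/ml_myket.npy')
node_raw_features = np.load('processed_data/myket/ml_myket_node.npy')

print('num_nodes:', node_raw_features.shape[0] - 1)
print('num_edges:', edge_raw_features.shape[0] - 1)
print('node_feat_dim:', node_raw_features.shape[-1])
print('edge_feat_dim:', edge_raw_features.shape[-1])
```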

preprocess_data/data_statistics.py (+3 -3)
```diff
@@ -14,11 +14,11 @@ def pprint_df(df, tablefmt='psql'):
 for dataset_name in sorted(all_datasets, key=lambda v: v.upper()):
     edge_raw_features = np.load('../processed_data/{}/ml_{}.npy'.format(dataset_name, dataset_name))
     node_raw_features = np.load('../processed_data/{}/ml_{}_node.npy'.format(dataset_name, dataset_name))
-    info = {'name': dataset_name,
+    info = {'dataset_name': dataset_name,
             'num_nodes': node_raw_features.shape[0] - 1,
-            'node_fea_dim': node_raw_features.shape[-1],
+            'node_feat_dim': node_raw_features.shape[-1],
             'num_edges': edge_raw_features.shape[0] - 1,
-            'edge_fea_dim': edge_raw_features.shape[-1]}
+            'edge_feat_dim': edge_raw_features.shape[-1]}
     records.append(info)
 
 info_df = pd.DataFrame.from_records(records)
```
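The rename makes the printed column headers `dataset_name`, `num_nodes`, `node_feat_dim`, `num_edges`, and `edge_feat_dim`, presumably matching the `feat` abbreviation used elsewhere in the repository. For context, a sketch of the `pprint_df` helper named in the hunk header, assuming it wraps `tabulate` (its `tablefmt='psql'` default suggests this, but the body is not shown in this diff):

```python
# Sketch only: assumes pprint_df wraps tabulate, as the tablefmt='psql'
# default in its signature suggests; the real body is not in this diff.
import pandas as pd
from tabulate import tabulate

def pprint_df(df: pd.DataFrame, tablefmt: str = 'psql') -> None:
    # Render the statistics table with psql-style borders.
    print(tabulate(df, headers='keys', tablefmt=tablefmt, showindex=False))
```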
preprocess_data/preprocess_all_data.py (+1 -1)
```diff
@@ -1,5 +1,5 @@
 import os
 
-for name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'enron', 'SocialEvo', 'myket',
+for name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo',
              'uci', 'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts']:
     os.system(f'python preprocess_data.py --dataset_name {name}')
```
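The reordering groups `myket` with the other bipartite datasets, mirroring the bipartite check in `preprocess_data.py` below. Since `os.system` ignores failures, a variant using `subprocess.run` with `check=True` would stop the batch at the first dataset that fails to preprocess; a sketch, not part of the commit:

```python
# Alternative batch runner (sketch, not part of the commit): check=True
# raises CalledProcessError and stops the loop if any dataset fails.
import subprocess

DATASETS = ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo',
            'uci', 'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts']

for name in DATASETS:
    subprocess.run(['python', 'preprocess_data.py', '--dataset_name', name], check=True)
```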

preprocess_data/preprocess_data.py (+1 -2)
```diff
@@ -163,8 +163,7 @@ def check_data(dataset_name: str):
     if args.dataset_name in ['enron', 'SocialEvo', 'uci']:
         Path("../processed_data/{}/".format(args.dataset_name)).mkdir(parents=True, exist_ok=True)
         copy_tree("../DG_data/{}/".format(args.dataset_name), "../processed_data/{}/".format(args.dataset_name))
-        print(
-            f'the original dataset of {args.dataset_name} is unavailable, directly use the processed dataset by previous works.')
+        print(f'the original dataset of {args.dataset_name} is unavailable, directly use the processed dataset by previous works.')
     else:
         # bipartite dataset
         if args.dataset_name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket']:
```
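This hunk only collapses the two-line `print` into one; the surrounding fallback copies previously processed files when no public original exists. As an aside, `copy_tree` here presumably comes from `distutils.dir_util`, and `distutils` was removed in Python 3.12; an equivalent fallback with the standard library's `shutil`, as a sketch:

```python
# Sketch of the same fallback using shutil instead of copy_tree
# (assuming copy_tree is distutils.dir_util.copy_tree, which no longer
# exists on Python 3.12+). dirs_exist_ok=True merges into an existing
# destination, matching copy_tree's behavior.
import shutil
from pathlib import Path

dataset_name = 'enron'  # one of the datasets whose original is unavailable
dst = Path('../processed_data/{}/'.format(dataset_name))
dst.mkdir(parents=True, exist_ok=True)
shutil.copytree('../DG_data/{}/'.format(dataset_name), dst, dirs_exist_ok=True)
```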
