Skip to content

Commit 3a4bc00

Browse files
committed
chore(preprocess): convenient preprocess; better statistics displaying
i) preprocess all datasets with one line `python preprocess_all_data.py`. ii) display the data statistic in a more concise table way
1 parent 995715c commit 3a4bc00

File tree

2 files changed

+7
-4
lines changed

2 files changed

+7
-4
lines changed

preprocess_data/preprocess_data.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,9 @@ def check_data(dataset_name: str):
151151

152152

153153
parser = argparse.ArgumentParser('Interface for preprocessing datasets')
154-
parser.add_argument('--dataset_name', type=str, choices=['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo', 'uci',
155-
'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts'],
154+
parser.add_argument('--dataset_name', type=str,
155+
choices=['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket', 'enron', 'SocialEvo', 'uci',
156+
'Flights', 'CanParl', 'USLegis', 'UNtrade', 'UNvote', 'Contacts'],
156157
help='Dataset name', default='wikipedia')
157158
parser.add_argument('--node_feat_dim', type=int, default=172, help='Number of node raw features')
158159

@@ -162,7 +163,8 @@ def check_data(dataset_name: str):
162163
if args.dataset_name in ['enron', 'SocialEvo', 'uci']:
163164
Path("../processed_data/{}/".format(args.dataset_name)).mkdir(parents=True, exist_ok=True)
164165
copy_tree("../DG_data/{}/".format(args.dataset_name), "../processed_data/{}/".format(args.dataset_name))
165-
print(f'the original dataset of {args.dataset_name} is unavailable, directly use the processed dataset by previous works.')
166+
print(
167+
f'the original dataset of {args.dataset_name} is unavailable, directly use the processed dataset by previous works.')
166168
else:
167169
# bipartite dataset
168170
if args.dataset_name in ['wikipedia', 'reddit', 'mooc', 'lastfm', 'myket']:
@@ -171,5 +173,6 @@ def check_data(dataset_name: str):
171173
preprocess_data(dataset_name=args.dataset_name, bipartite=False, node_feat_dim=args.node_feat_dim)
172174
print(f'{args.dataset_name} is processed successfully.')
173175

174-
check_data(args.dataset_name)
176+
if args.dataset_name not in ['myket']:
177+
check_data(args.dataset_name)
175178
print(f'{args.dataset_name} passes the checks successfully.')
19.1 MB
Binary file not shown.

0 commit comments

Comments
 (0)