Skip to content

Commit 2125620

Browse files
authored
DVC increased matrix_bool size
1 parent f1abccc commit 2125620

File tree

2 files changed

+12
-7
lines changed

2 files changed

+12
-7
lines changed

params.yaml

+2 -1
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
prepare:
22
upper_limit: 2000
33
lower_limit: 3
4+
upper_limit_arules: 10000
5+
lower_limit_arules: 20
46
generate_association_rules_final:
5-
upper_limit: 5000
67
min_support: 0.00035
78
cluster_TSNA:
89
n_components: 3

prepare.py

+10 -6
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,13 @@
1818
params = yaml.safe_load(open('params.yaml'))['prepare']
1919
upper_limit = params['upper_limit']
2020
lower_limit = params['lower_limit']
21+
upper_limit_arules = params['upper_limit_arules']
22+
lower_limit_arules = params['lower_limit_arules']
2123

2224
input_file = Path(sys.argv[1]) #'reddit_scrapper/data/scrapped_data.json'
2325
input_index = Path(sys.argv[2]) #'reddit_scrapper/data/list_of_unique_subreddits.json'
24-
#Path('prepared').mkdir(parents=True, exist_ok=True)
26+
Path('prepared').mkdir(parents=True, exist_ok=True)
27+
Path('target').mkdir(parents=True, exist_ok=True)
2528
data = json.load(open(input_file,'r+'))
2629
subreddit_names_list = json.load(open(input_index,'r+'))
2730
subreddit_index = dict(zip(subreddit_names_list,range(len(subreddit_names_list))))
@@ -61,13 +64,14 @@ def extract_most_popular_subreddits(df,lower_limit,upper_limit,clear_zero_rows=T
6164

6265

6366
matrix = create_matrix(data,len(subreddit_names_list),subreddit_index)
64-
print(matrix.shape)
6567
df = filter_matrix(matrix,5,index_subreddit)
6668
del matrix
69+
df.rename(columns=index_subreddit,inplace=True)
70+
most_popular_reddits = df.sum(axis=0).sort_values(ascending=False)[lower_limit_arules:upper_limit_arules].index
71+
df_bool = df.loc[:,most_popular_reddits].astype(bool).astype(int)
6772
df = extract_most_popular_subreddits(df,lower_limit,upper_limit)
73+
print(df.shape)
6874
df.to_csv('prepared/matrix.csv', index=False)
6975
print("Almost done...")
70-
#df = filter_matrix(matrix,2,index_subreddit)
71-
df = df.astype(bool).astype(int)
72-
df.rename(columns=index_subreddit,inplace=True)
73-
df.to_csv('prepared/matrix_bool.csv', index=False)
76+
print(df_bool.shape)
77+
df_bool.to_csv('prepared/matrix_bool.csv', index=False)

0 commit comments

Comments
 (0)