# Load the 'prepare' stage parameters from the DVC params file.
# Use a context manager so the file handle is closed deterministically
# (the original `open(...)` was never closed).
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['prepare']

# Slice bounds used later by extract_most_popular_subreddits (dense matrix).
upper_limit = params['upper_limit']
lower_limit = params['lower_limit']
# Popularity-rank slice bounds for the boolean association-rules matrix.
upper_limit_arules = params['upper_limit_arules']
lower_limit_arules = params['lower_limit_arules']
21
23
22
24
# CLI arguments: path to the scraped data JSON and to the subreddit index JSON
# (e.g. 'reddit_scrapper/data/scrapped_data.json' and
# 'reddit_scrapper/data/list_of_unique_subreddits.json').
input_file = Path(sys.argv[1])
input_index = Path(sys.argv[2])

# Make sure both output directories exist before anything is written.
for out_dir in ('prepared', 'target'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
# Load the scraped posts and the list of unique subreddit names.
# Open read-only ('r', not the original 'r+' — nothing is written here)
# and close the handles via context managers.
with open(input_file, 'r') as data_file:
    data = json.load(data_file)
with open(input_index, 'r') as index_file:
    subreddit_names_list = json.load(index_file)

# Map each subreddit name to its integer column position.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
# Build the activity matrix over all known subreddits, then keep only rows
# passing the threshold of 5 (semantics of the threshold live in
# filter_matrix, defined elsewhere in this file).
matrix = create_matrix(data, len(subreddit_names_list), subreddit_index)
df = filter_matrix(matrix, 5, index_subreddit)
del matrix  # drop the raw matrix as soon as the DataFrame exists

# Replace integer column ids with subreddit names.
df.rename(columns=index_subreddit, inplace=True)

# Rank subreddits by total activity (descending) and keep the configured
# popularity band for the boolean association-rules matrix.
totals = df.sum(axis=0).sort_values(ascending=False)
most_popular_reddits = totals[lower_limit_arules:upper_limit_arules].index
df_bool = df.loc[:, most_popular_reddits].astype(bool).astype(int)

# NOTE: df_bool is built from df BEFORE this reassignment on purpose.
df = extract_most_popular_subreddits(df, lower_limit, upper_limit)
print(df.shape)
df.to_csv('prepared/matrix.csv', index=False)

print("Almost done...")
print(df_bool.shape)
df_bool.to_csv('prepared/matrix_bool.csv', index=False)
0 commit comments