1
- import mlxtend as mlx
2
- from tqdm .notebook import tqdm , trange
3
- from itertools import chain ,product
4
- import pandas as pd
5
- import numpy as np
6
- from sklearn .cluster import DBSCAN
7
- import json
8
- from sklearn .metrics import silhouette_samples , silhouette_score
9
- from sklearn .manifold import TSNE
10
- import matplotlib .pyplot as plt
11
- import matplotlib .cm as cm
12
- import plotly .express as px
13
- import sys
14
- import yaml
15
- from pathlib import Path
16
-
17
# Stage parameters come from the `prepare` section of params.yaml; the two
# limits bound the slice of the popularity ranking taken further down.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['prepare']
upper_limit = params['upper_limit']
lower_limit = params['lower_limit']

# Input locations are given on the command line.
input_file = Path(sys.argv[1])  # e.g. 'reddit_scrapper/data/scrapped_data.json'
input_index = Path(sys.argv[2])  # e.g. 'reddit_scrapper/data/list_of_unique_subreddits.json'
int_output = Path('data') / 'matrix.csv'
bool_output = Path('data') / 'matrix_bool.csv'

# Read the inputs from the paths passed on the command line (read-only, and
# closed as soon as they are parsed).
with open(input_file) as data_file:
    data = json.load(data_file)
with open(input_index) as index_file:
    subreddit_names_list = json.load(index_file)

# Two-way lookup between a subreddit name and its column position.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))
29
-
30
def create_matrix(data, matrix_width, subreddit_index):
    """Build a dense matrix with one row per entry of ``data``.

    Args:
        data: mapping whose values are themselves mappings of
            ``key -> value``; each outer value fills one row, in iteration
            order.
        matrix_width: number of columns of the resulting matrix.
        subreddit_index: mapping from an inner key to its column position.

    Returns:
        A ``(len(data), matrix_width)`` NumPy array; cells that are never
        written stay 0.

    Raises:
        KeyError: if an inner key is missing from ``subreddit_index``.
    """
    matrix = np.zeros(shape=(len(data), matrix_width))
    # Iterating a 2-D array yields row views, so writes land in ``matrix``.
    for row, activity in zip(matrix, data.values()):
        for subreddit, count in activity.items():
            row[subreddit_index[subreddit]] = count
    return matrix
38
-
39
def filter_matrix(matrix, threshold, index_subreddit):
    """Drop rows and columns that never exceed ``threshold`` and label the rest.

    Args:
        matrix: 2-D NumPy array.
        threshold: a cell counts as active when its value is strictly
            greater than this.
        index_subreddit: mapping from original column position to column
            label, applied through ``DataFrame.rename`` (positions missing
            from the mapping keep their integer label).

    Returns:
        A DataFrame holding the original values (not the mask) of every row
        and column that has at least one active cell.
    """
    active = matrix > threshold
    kept_rows = active.any(axis=1)
    kept_columns = active.any(axis=0)
    del active  # release the mask before the sub-matrix copy is made

    # flatnonzero is always 1-D, so a single surviving column still yields a
    # valid list of column labels (a squeezed argwhere would be 0-d).
    column_positions = np.flatnonzero(kept_columns)
    df = pd.DataFrame(matrix[np.ix_(kept_rows, kept_columns)], columns=column_positions)
    del kept_rows, kept_columns
    df.rename(columns=index_subreddit, inplace=True)
    return df
50
-
51
def extract_most_popular_subreddits(df, lower_limit, upper_limit):
    """Keep only the columns ranked ``lower_limit:upper_limit`` by column total.

    Columns are ranked by their sum in descending order and the slice
    ``[lower_limit:upper_limit]`` of that ranking is selected. Rows whose
    selected values sum to zero are removed, then duplicate rows are dropped.

    Args:
        df: DataFrame of numeric values.
        lower_limit: start of the ranking slice.
        upper_limit: end (exclusive) of the ranking slice.

    Returns:
        A DataFrame with the selected columns in ranking order.
    """
    totals = df.sum(axis=0)
    ranking = totals.sort_values(ascending=False)
    most_popular_reddits = ranking[lower_limit:upper_limit].index

    # Translate the selected labels back to positions in the original frame.
    position_of = {label: position for position, label in enumerate(df.columns)}
    selected_positions = [position_of[label] for label in most_popular_reddits]
    values = df.to_numpy()[:, selected_positions]
    del df, position_of, selected_positions

    # Discard rows that have nothing left in the selected columns.
    has_activity = values.sum(axis=1) != 0
    values = values[has_activity]

    result = pd.DataFrame(values, columns=most_popular_reddits)
    return result.drop_duplicates()
60
-
61
# Dense matrix with one row per entry of the scraped data.
matrix = create_matrix(data, len(subreddit_names_list), subreddit_index)
del data  # the raw JSON is not needed once the matrix exists

# First output: rows/columns with at least one cell above 5, narrowed to the
# configured slice of the popularity ranking.
df = filter_matrix(matrix, 5, index_subreddit)
df = extract_most_popular_subreddits(df, lower_limit, upper_limit)
df.to_csv(int_output, header=None)

# Second output: the same row/column filtering with the looser threshold 2.
df = filter_matrix(matrix, 2, index_subreddit)
del matrix
# NOTE(review): despite the file name, the values written here are the raw
# matrix values, not 0/1 flags — confirm this is intended.
df.to_csv(bool_output, header=None)
1
+ import mlxtend as mlx
2
+ from tqdm .notebook import tqdm , trange
3
+ from itertools import chain ,product
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn .cluster import DBSCAN
7
+ import json
8
+ from sklearn .metrics import silhouette_samples , silhouette_score
9
+ from sklearn .manifold import TSNE
10
+ import matplotlib .pyplot as plt
11
+ import matplotlib .cm as cm
12
+ import plotly .express as px
13
+ import sys
14
+ import yaml
15
+ from pathlib import Path
16
+
17
# Stage parameters come from the `prepare` section of params.yaml; the two
# limits bound the slice of the popularity ranking taken further down.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['prepare']
upper_limit = params['upper_limit']
lower_limit = params['lower_limit']

# Input locations are given on the command line.
input_file = Path(sys.argv[1])  # e.g. 'reddit_scrapper/data/scrapped_data.json'
input_index = Path(sys.argv[2])  # e.g. 'reddit_scrapper/data/list_of_unique_subreddits.json'
# NOTE(review): nothing in this script creates the `prepared/` directory the
# outputs are written to — confirm it exists before the script runs.

# Inputs are only read, so open them read-only and close them once parsed.
with open(input_file) as data_file:
    data = json.load(data_file)
with open(input_index) as index_file:
    subreddit_names_list = json.load(index_file)

# Two-way lookup between a subreddit name and its column position.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))
28
+
29
def create_matrix(data, matrix_width, subreddit_index):
    """Build a dense matrix with one row per entry of ``data``.

    Args:
        data: mapping whose values are themselves mappings of
            ``key -> value``; each outer value fills one row, in iteration
            order.
        matrix_width: number of columns of the resulting matrix.
        subreddit_index: mapping from an inner key to its column position.

    Returns:
        A ``(len(data), matrix_width)`` NumPy array; cells that are never
        written stay 0.

    Raises:
        KeyError: if an inner key is missing from ``subreddit_index``.
    """
    matrix = np.zeros(shape=(len(data), matrix_width))
    # Iterating a 2-D array yields row views, so writes land in ``matrix``.
    for row, activity in zip(matrix, data.values()):
        for subreddit, count in activity.items():
            row[subreddit_index[subreddit]] = count
    return matrix
37
+
38
def filter_matrix(matrix, threshold, index_subreddit):
    """Drop rows and columns that never exceed ``threshold`` and label the rest.

    Args:
        matrix: 2-D NumPy array.
        threshold: a cell counts as active when its value is strictly
            greater than this.
        index_subreddit: mapping from original column position to column
            label, applied through ``DataFrame.rename`` (positions missing
            from the mapping keep their integer label).

    Returns:
        A DataFrame holding the original values (not the mask) of every row
        and column that has at least one active cell.
    """
    active = matrix > threshold
    kept_rows = active.any(axis=1)
    kept_columns = active.any(axis=0)
    del active  # release the mask before the sub-matrix copy is made

    # flatnonzero is always 1-D, so a single surviving column still yields a
    # valid list of column labels (a squeezed argwhere would be 0-d).
    column_positions = np.flatnonzero(kept_columns)
    df = pd.DataFrame(matrix[np.ix_(kept_rows, kept_columns)], columns=column_positions)
    del kept_rows, kept_columns
    df.rename(columns=index_subreddit, inplace=True)
    return df
49
+
50
def extract_most_popular_subreddits(df, lower_limit, upper_limit):
    """Keep only the columns ranked ``lower_limit:upper_limit`` by column total.

    Columns are ranked by their sum in descending order and the slice
    ``[lower_limit:upper_limit]`` of that ranking is selected. Rows whose
    selected values sum to zero are removed, then duplicate rows are dropped.

    Args:
        df: DataFrame of numeric values.
        lower_limit: start of the ranking slice.
        upper_limit: end (exclusive) of the ranking slice.

    Returns:
        A DataFrame with the selected columns in ranking order.
    """
    totals = df.sum(axis=0)
    ranking = totals.sort_values(ascending=False)
    most_popular_reddits = ranking[lower_limit:upper_limit].index

    # Translate the selected labels back to positions in the original frame.
    position_of = {label: position for position, label in enumerate(df.columns)}
    selected_positions = [position_of[label] for label in most_popular_reddits]
    values = df.to_numpy()[:, selected_positions]
    del df, position_of, selected_positions

    # Discard rows that have nothing left in the selected columns.
    has_activity = values.sum(axis=1) != 0
    values = values[has_activity]

    result = pd.DataFrame(values, columns=most_popular_reddits)
    return result.drop_duplicates()
59
+
60
# Dense matrix with one row per entry of the scraped data.
matrix = create_matrix(data, len(subreddit_names_list), subreddit_index)

# Keep rows/columns with at least one cell above 5, then narrow to the
# configured slice of the popularity ranking.
df = filter_matrix(matrix, 5, index_subreddit)
df = extract_most_popular_subreddits(df, lower_limit, upper_limit)
df.to_csv('prepared/matrix.csv', header=None)
print("Almost done...")

# Second output: the same table reduced to 0/1 flags (any non-zero value
# becomes 1). Columns were already labelled inside filter_matrix and headers
# are not written, so no further renaming is needed.
df = df.astype(bool).astype(int)
df.to_csv('prepared/matrix_bool.csv', header=None)
0 commit comments