Skip to content

Commit 6dae7a1

Browse files
authored
DVC final (I hope)
1 parent 7bccd3b commit 6dae7a1

7 files changed

+92
-22
lines changed

cluster_PCA.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from sklearn.cluster import KMeans
44
import pandas as pd
55
import numpy as np
6+
import pickle
67
import sys
78
import yaml
89
from pathlib import Path
@@ -22,4 +23,11 @@
2223
pca_df['clustering'] = clustering.labels_
2324
pca_df['clustering'] = pca_df['clustering'].astype(str)
2425

25-
pca_df.to_csv('target/pca.csv', header=None)
26+
pca_df.to_csv('target/pca.csv', index=False)
27+
28+
outfile = open('target/pca.pkl','wb')
29+
pickle.dump(pca,outfile)
30+
outfile.close()
31+
outfile = open('target/clustering.pkl','wb')
32+
pickle.dump(clustering,outfile)
33+
outfile.close()

cluster_TSNA.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,20 @@
2121
input_file = sys.argv[1]
2222
#Path(output_dir).mkdir(exist_ok=True)
2323
df = pd.read_csv(input_file, sep=',')
24-
24+
df = df.iloc[:-3,:]
2525
tsne = TSNE(n_components=n_components,n_jobs=n_jobs,random_state=random_state)
2626
#X_tsne = tsne.fit_transform(df)
2727
X_tsne = pd.read_csv('X_tsneL=3U=2000.csv')
28+
X_tsne.drop(columns=['Unnamed: 0'],inplace=True)
29+
2830
clustering = DBSCAN(eps=2, min_samples=8,n_jobs=-1).fit(X_tsne)
2931
X_tsne = pd.DataFrame(X_tsne,columns=['component1','component2','component3'])
3032
X_tsne['clustering'] = clustering.labels_
3133
X_tsne['clustering'] = X_tsne['clustering'].astype(str)
3234
X_tsne = X_tsne[X_tsne['clustering'] !='-1']
3335

3436
clustered_useres_dicts = {}
37+
print(df.shape, X_tsne.shape)
3538
df.loc[:,'clustering'] = clustering.labels_
3639
clustered_users = df.groupby(by=df['clustering']).sum()
3740
clustered_users_matrix = clustered_users.to_numpy().astype(int)
@@ -49,4 +52,4 @@
4952
pickle.dump(X_tsne,outfile)
5053
outfile.close()
5154

52-
clustered_users.to_csv('target/tsna.csv', header=None)
55+
clustered_users.to_csv('target/clustered_users.csv')

dvc.yaml

+6-3
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ stages:
4040
- cluster_TSNA.n_jobs
4141
- cluster_TSNA.random_state
4242
outs:
43-
- target/tsna.csv
43+
- target/clustered_users.csv
4444
- target/tsna.pkl
4545
scrape_user:
4646
cmd: python scrape_user.py
@@ -49,9 +49,12 @@ stages:
4949
outs:
5050
- target/user.json
5151
recommend:
52-
cmd: python recommend.py target/arules-10000-00035.json target/user.json target/pca.csv
52+
cmd: python recommend.py target/arules-10000-00035.json target/user.json target/pca.csv prepared/matrix.csv target/pca.pkl target/clustering.pkl
5353
deps:
5454
- recommend.py
5555
- target/arules-10000-00035.json
5656
- target/user.json
57-
- target/pca.csv
57+
- target/pca.csv
58+
- prepared/matrix.csv
59+
- target/pca.pkl
60+
- target/clustering.pkl

generate_association_rules_final.py

-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#Path(output_dir).mkdir(exist_ok=True)
2020

2121
df_bool = pd.read_csv(input_path, sep=',')
22-
df_bool = df_bool.iloc[: , 1:]
2322
frequent_itemsets = apriori(df_bool, min_support=min_support,
2423
use_colnames=True, low_memory=True,
2524
verbose=2, max_len=7)

params.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ cluster_PCA:
1212
n_components: 200
1313
n_clusters: 700
1414
scrape_user:
15-
username: FirstGalacticEmpire
15+
username: FirstGalacticEmpire

prepare.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import plotly.express as px
1313
import sys
1414
import yaml
15+
1516
from pathlib import Path
1617

1718
params = yaml.safe_load(open('params.yaml'))['prepare']
@@ -34,8 +35,7 @@ def create_matrix(data,matrix_width,subreddit_index):
3435
for key,value in redditor.items():
3536
matrix[idx,subreddit_index[key]] = value
3637
return matrix
37-
38-
def filter_matrix(matrix,threshold,index_subreddit):
38+
def filter_matrix(matrix,threshold,indexsubreddit):
3939
mask = np.where(matrix>threshold,True,False)
4040
rows = ~np.all(mask==False,axis=1)
4141
columns = ~np.all(mask==False,axis=0)
@@ -46,23 +46,28 @@ def filter_matrix(matrix,threshold,index_subreddit):
4646
del data,columns
4747
df.rename(columns=index_subreddit,inplace=True)
4848
return df
49-
50-
def extract_most_popular_subreddits(df,lower_limit,upper_limit):
49+
def extract_most_popular_subreddits(df,lower_limit,upper_limit,clear_zero_rows=True):
    """Keep only the columns ranked ``lower_limit:upper_limit`` by column total.

    The columns of ``df`` are ranked by their sum (largest first); the slice
    ``[lower_limit:upper_limit]`` of that ranking is kept, in ranking order.

    Args:
        df: Numeric DataFrame (presumably users x subreddits -- confirm
            against the caller).
        lower_limit: Start of the ranking slice (inclusive).
        upper_limit: End of the ranking slice (exclusive).
        clear_zero_rows: When true (the default), rows whose kept columns sum
            to zero are removed. When false, those rows are preserved.

    Returns:
        A new DataFrame holding the selected columns, with duplicate rows
        dropped. The input frame is not modified.
    """
    ranked_totals = df.sum(axis=0).sort_values(ascending=False)
    most_popular_reddits = ranked_totals[lower_limit:upper_limit].index

    # Translate the selected column labels into positions so the selection
    # can be done on the dense NumPy array.
    position_of = {name: pos for pos, name in enumerate(df.columns)}
    column_indexes = [position_of[name] for name in most_popular_reddits]
    X_np = df.to_numpy()[:, column_indexes]

    # The zero-row filter is applied only when requested. Previously it ran
    # unconditionally and the "keep" branch then read ``df`` after
    # ``del df``, which raised UnboundLocalError.
    if clear_zero_rows:
        X_np = X_np[X_np.sum(axis=1) != 0]

    return pd.DataFrame(X_np, columns=most_popular_reddits).drop_duplicates()
61+
5962

6063
matrix = create_matrix(data,len(subreddit_names_list),subreddit_index)
64+
print(matrix.shape)
6165
df = filter_matrix(matrix,5,index_subreddit)
66+
del matrix
6267
df = extract_most_popular_subreddits(df,lower_limit,upper_limit)
63-
df.to_csv('prepared/matrix.csv', header=None)
68+
df.to_csv('prepared/matrix.csv', index=False)
6469
print("Almost done...")
6570
#df = filter_matrix(matrix,2,index_subreddit)
6671
df = df.astype(bool).astype(int)
6772
df.rename(columns=index_subreddit,inplace=True)
68-
df.to_csv('prepared/matrix_bool.csv', header=None)
73+
df.to_csv('prepared/matrix_bool.csv', index=False)

recommend.py

+59-7
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,32 @@
33
import json
44
import pickle
55
import yaml
6+
import numpy as np
67
from pathlib import Path
8+
from sklearn.neighbors import BallTree
79

8-
user = None
10+
params = yaml.safe_load(open('params.yaml'))['scrape_user']
11+
username = params['username']
912

1013

1114

1215
rules = pd.read_json(sys.argv[1])
13-
user = json.load(sys.argv[2])
14-
tsna = pd.read_csv(sys.argv[3], sep=',')
15-
infile = open(sys.argv[4],'rb')
16-
tsna_model = pickle.load(infile)
16+
f = open(sys.argv[2])
17+
user = json.load(f)
18+
f.close()
19+
#tsna = pd.read_csv(sys.argv[3], sep=',')
20+
#infile = open(sys.argv[4],'rb')
21+
#tsna_model = pickle.load(infile)
22+
#infile.close()
23+
pca_df = pd.read_csv(sys.argv[3], sep=',')
24+
df = pd.read_csv(sys.argv[4], sep=',')
25+
infile = open(sys.argv[5],'rb')
26+
pca = pickle.load(infile)
1727
infile.close()
18-
pca = pd.read_csv(sys.argv[5], sep=',')
28+
infile = open(sys.argv[6],'rb')
29+
clustering = pickle.load(infile)
30+
infile.close()
31+
1932

2033
rules["antecedents"] = rules["antecedents"].apply(lambda x: frozenset(x))
2134
rules["consequents"] = rules["consequents"].apply(lambda x: frozenset(x))
@@ -42,4 +55,43 @@ def get_rules(set_of_subreddits, top_n=100):
4255
print("User likes:", sub_red)
4356
print("User should like:",get_rules(sub_red,10))
4457

45-
user2 = {'user' : user}
58+
59+
60+
user = {username : user}
61+
def reshape_new_user(new_user,df):
    """Lay out one user's subreddit values as a single row matching ``df``.

    Args:
        new_user: Mapping ``{user_name: {subreddit: value}}``. Only the first
            entry is used.
        df: Reference DataFrame whose columns define the layout of the row.

    Returns:
        A one-row DataFrame indexed by the user name, with exactly the columns
        of ``df`` in the same order. Columns missing from the user's data are
        0; user keys that are not columns of ``df`` are ignored.

    Raises:
        ValueError: If ``new_user`` is empty.
    """
    if not new_user:
        raise ValueError("new_user must contain one {name: {subreddit: value}} entry")

    user_name, activity = next(iter(new_user.items()))

    # Start from an all-zero row so every reference column is present.
    row = dict.fromkeys(df.columns, 0)
    row.update((sub, value) for sub, value in activity.items() if sub in row)

    new_user_df = pd.Series(row, name=user_name).to_frame().T
    # Re-select explicitly so the column order is guaranteed to follow ``df``.
    return new_user_df[df.columns.to_list()]
70+
71+
def get_cluster_index(pca_new_user,pca_df,NN=10):
    """Pick a cluster for a new user by majority vote of nearest neighbours.

    Args:
        pca_new_user: 2-D array-like with the new user's coordinates in the
            same feature space as ``pca_df``; only the first row is used.
        pca_df: DataFrame of feature columns plus a ``"clustering"`` column
            holding each row's cluster label.
        NN: Number of nearest neighbours to consult. Clamped to the number of
            rows in ``pca_df``, because ``BallTree.query`` rejects a ``k``
            larger than the number of stored points.

    Returns:
        The most frequent cluster label among the neighbours, as an ``int``.
    """
    # Select features by name rather than "all but the last column", so the
    # result does not depend on where the label column happens to sit.
    features = pca_df.drop(columns="clustering")
    neighbours = min(NN, len(features))

    tree = BallTree(features)
    _, ind = tree.query(pca_new_user, k=neighbours)

    neighbour_labels = pca_df["clustering"].iloc[ind[0]]
    # value_counts() is already ordered by descending frequency, so the
    # extra sort_values() was redundant; idxmax() returns the top label.
    return int(neighbour_labels.value_counts().idxmax())
77+
78+
def get_clustered_subreddits(df,labels):
    """Sum the rows of ``df`` per cluster and list each cluster's non-zero columns.

    Args:
        df: Numeric DataFrame (presumably users x subreddits -- confirm
            against the caller). It is not modified.
        labels: Cluster label for each row of ``df``, in row order.

    Returns:
        ``{cluster_label: {column_name: total}}`` containing, for each
        cluster, only the columns whose integer total is greater than zero.

    Raises:
        ValueError: If ``labels`` does not have one entry per row of ``df``.
    """
    labels = np.asarray(labels)
    if len(labels) != len(df):
        raise ValueError("labels must have one entry per row of df")

    # Group by the label array directly. The previous version ignored the
    # ``labels`` argument, read the global ``clustering`` instead, and wrote
    # a temporary 'clustering' column into the caller's frame.
    clustered_users = df.groupby(labels).sum()
    clustered_users_matrix = clustered_users.to_numpy().astype(int)

    clustered_useres_dicts = {}
    for label, totals in zip(clustered_users.index, clustered_users_matrix):
        mask = totals > 0
        clustered_useres_dicts[label] = dict(
            zip(clustered_users.columns[mask], totals[mask])
        )
    return clustered_useres_dicts
89+
90+
new_user_df = reshape_new_user(user,df)
91+
pca_new_user = pca.transform(new_user_df)
92+
new_user_cluster_index = get_cluster_index(pca_new_user,pca_df)
93+
clustered_useres_dicts = get_clustered_subreddits(df,clustering.labels_)
94+
user_cluster = clustered_useres_dicts[new_user_cluster_index]
95+
user_cluster = {i[0]:i[1] for i in user_cluster.items() if i[0] not in list(user.values())[0].keys()}
96+
print(new_user_cluster_index,user,pd.Series(user_cluster).sort_values(ascending=False).head(20))
97+

0 commit comments

Comments
 (0)