Commit 28983b5

DVC pipeline #2
TSNA does not work yet and recommend is unfinished
1 parent 59c9b27 commit 28983b5

8 files changed: +337 -81 lines

cluster_PCA.py

+25
@@ -0,0 +1,25 @@
from sklearn.decomposition import PCA
from sklearn.neighbors import BallTree
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import sys
import yaml
from pathlib import Path

# stage parameters from params.yaml
params = yaml.safe_load(open('params.yaml'))['cluster_PCA']
n_components = params['n_components']
n_clusters = params['n_clusters']
input_file = sys.argv[1]
Path('target').mkdir(exist_ok=True)
df = pd.read_csv(input_file, sep=',')

# project the user/subreddit matrix onto the first n_components principal components
pca = PCA(n_components=n_components).fit(df)
pca_df = pd.DataFrame(pca.transform(df))

# k-means on the PCA projection; labels stored as strings for plotting
clustering = KMeans(n_clusters=n_clusters).fit(pca_df)
pca_df['clustering'] = clustering.labels_
pca_df['clustering'] = pca_df['clustering'].astype(str)

pca_df.to_csv('target/pca.csv', header=None)
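
A quick way to sanity-check the n_components=200 choice in params.yaml is to look at how much variance the projection keeps. The sketch below is illustrative only (it assumes prepared/matrix.csv exists and is read the same way as above) and is not part of the commit.

from sklearn.decomposition import PCA
import pandas as pd

df = pd.read_csv('prepared/matrix.csv', sep=',')
pca = PCA(n_components=200).fit(df)
# fraction of total variance captured by the 200-component projection
print('variance kept:', pca.explained_variance_ratio_.sum())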

cluster_TSNA.py

+52
@@ -0,0 +1,52 @@
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import product
import plotly.express as px
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import sys
import yaml
from pathlib import Path
import pickle

# stage parameters from params.yaml
params = yaml.safe_load(open('params.yaml'))['cluster_TSNA']
n_components = params['n_components']
n_jobs = params['n_jobs']
random_state = params['random_state']

input_file = sys.argv[1]
Path('target').mkdir(exist_ok=True)  # make sure the output directory exists
df = pd.read_csv(input_file, sep=',')

tsne = TSNE(n_components=n_components,n_jobs=n_jobs,random_state=random_state)
# the embedding is not recomputed here yet; a cached result is read instead
#X_tsne = tsne.fit_transform(df)
X_tsne = pd.read_csv('X_tsneL=3U=2000.csv')
# density-based clustering on the embedding; label -1 marks noise points
clustering = DBSCAN(eps=2, min_samples=8,n_jobs=-1).fit(X_tsne)
# NOTE: this re-wrap only assigns the new column names when X_tsne is the ndarray
# returned by fit_transform; with the cached CSV it reindexes by column name instead
X_tsne = pd.DataFrame(X_tsne,columns=['component1','component2','component3'])
X_tsne['clustering'] = clustering.labels_
X_tsne['clustering'] = X_tsne['clustering'].astype(str)
X_tsne = X_tsne[X_tsne['clustering'] !='-1']

# per-cluster totals: for every cluster, a dict of subreddit -> summed activity
clustered_useres_dicts = {}
df.loc[:,'clustering'] = clustering.labels_
clustered_users = df.groupby(by=df['clustering']).sum()
clustered_users_matrix = clustered_users.to_numpy().astype(int)
for i in range(clustered_users_matrix.shape[0]):
    mask = np.where(clustered_users_matrix[i,:] > 0,True,False)
    clustered_useres_dicts[clustered_users.iloc[i].name] = \
        dict(zip(clustered_users.columns[mask],clustered_users_matrix[i,:][mask]))

#for i in clustered_useres_dicts:
#    x = clustered_useres_dicts[i]
#    print(sorted(x.items(),key=lambda item: item[1],reverse=True)[:5])

#clustered_useres_dicts[df.loc[5,"clustering"]]  # leftover debug lookup

outfile = open('target/tsna.pkl','wb')
pickle.dump(X_tsne,outfile)
outfile.close()

clustered_users.to_csv('target/tsna.csv', header=None)
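
The commented-out fit_transform call is the step that would remove the dependency on the cached 'X_tsneL=3U=2000.csv'. A minimal sketch of recomputing and caching the embedding, using the same parameter values as params.yaml (the cache filename here is illustrative, not part of the commit):

import pandas as pd
from sklearn.manifold import TSNE

df = pd.read_csv('prepared/matrix.csv', sep=',')
tsne = TSNE(n_components=3, n_jobs=-1, random_state=42)
embedding = tsne.fit_transform(df)  # slow for large matrices, hence the cached CSV above
pd.DataFrame(embedding, columns=['component1', 'component2', 'component3']) \
    .to_csv('X_tsne_cache.csv', index=False)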

dvc.yaml

+59
@@ -0,0 +1,59 @@
stages:
  prepare:
    cmd: python prepare.py reddit_scrapper/data/scrapped_data.json reddit_scrapper/data/list_of_unique_subreddits.json
    deps:
      - prepare.py
      - reddit_scrapper/data/list_of_unique_subreddits.json
      - reddit_scrapper/data/scrapped_data.json
    params:
      - prepare.lower_limit
      - prepare.upper_limit
    outs:
      - prepared/matrix.csv
      - prepared/matrix_bool.csv
  generate_association_rules_final:
    cmd: python generate_association_rules_final.py prepared/matrix_bool.csv
    deps:
      - generate_association_rules_final.py
      - prepared/matrix_bool.csv
    params:
      - generate_association_rules_final.min_support
    outs:
      - target/arules-10000-00035.json
  cluster_PCA:
    cmd: python cluster_PCA.py prepared/matrix.csv
    deps:
      - cluster_PCA.py
      - prepared/matrix.csv
    params:
      - cluster_PCA.n_components
      - cluster_PCA.n_clusters
    outs:
      - target/pca.csv
  cluster_TSNA:
    cmd: python cluster_TSNA.py prepared/matrix.csv
    deps:
      - cluster_TSNA.py
      - prepared/matrix.csv
    params:
      - cluster_TSNA.n_components
      - cluster_TSNA.n_jobs
      - cluster_TSNA.random_state
    outs:
      - target/tsna.csv
      - target/tsna.pkl
  scrape_user:
    cmd: python scrape_user.py
    params:
      - scrape_user.username
    outs:
      - target/user.json
  recommend:
    cmd: python recommend.py target/arules-10000-00035.json target/user.json target/tsna.csv target/tsna.pkl target/pca.csv
    deps:
      - recommend.py
      - target/arules-10000-00035.json
      - target/user.json
      - target/tsna.csv
      - target/tsna.pkl
      - target/pca.csv
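
With this file in place the whole pipeline is reproduced with dvc repro, and dvc dag shows the stage graph. Below is a small optional sketch, not part of the commit, for catching typos between the two YAML files: every params: entry in dvc.yaml should exist in params.yaml.

import yaml

dvc = yaml.safe_load(open('dvc.yaml'))
params = yaml.safe_load(open('params.yaml'))
for stage, spec in dvc['stages'].items():
    for ref in spec.get('params', []):
        # e.g. 'cluster_PCA.n_components' -> params['cluster_PCA']['n_components']
        section, key = ref.split('.', 1)
        assert key in params.get(section, {}), f'{stage}: {ref} missing from params.yaml'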

generate_association_rules_final.py

+40
@@ -0,0 +1,40 @@
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain
import time
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import yaml
import sys
from pathlib import Path

# stage parameters from params.yaml
params = yaml.safe_load(open('params.yaml'))['generate_association_rules_final']
min_support = params['min_support']
input_path = sys.argv[1]
Path('target').mkdir(exist_ok=True)  # make sure the output directory exists

# boolean user/subreddit matrix; drop the index column written by prepare.py
df_bool = pd.read_csv(input_path, sep=',')
df_bool = df_bool.iloc[: , 1:]
# frequent subreddit combinations and the association rules between them
frequent_itemsets = apriori(df_bool, min_support=min_support,
                            use_colnames=True, low_memory=True,
                            verbose=2, max_len=7)
rules = association_rules(frequent_itemsets,
                          metric='lift',
                          min_threshold=1.01)
del frequent_itemsets
rules.to_json("target/arules-10000-00035.json")
del rules


# earlier variant kept for reference:
#frequent_itemsets = apriori(df_bool, min_support=min_support, use_colnames=True)
#rules = association_rules(frequent_itemsets,
#                          metric='confidence',
#                          min_threshold=0.7)
#
#rules.to_json(output_dir + "arules.json")
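
To eyeball the exported rules before recommend.py consumes them, they can be read back the same way recommend.py does. A short sketch (the column names antecedents, consequents and lift come from mlxtend's association_rules output):

import pandas as pd

rules = pd.read_json('target/arules-10000-00035.json')
rules['antecedents'] = rules['antecedents'].apply(frozenset)
rules['consequents'] = rules['consequents'].apply(frozenset)
# strongest co-occurrence rules first
print(rules.nlargest(5, 'lift')[['antecedents', 'consequents', 'lift']])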

params.yaml

+15-3
@@ -1,3 +1,15 @@
prepare:
  upper_limit: 2000
  lower_limit: 3
generate_association_rules_final:
  upper_limit: 5000
  min_support: 0.00035
cluster_TSNA:
  n_components: 3
  n_jobs: -1
  random_state: 42
cluster_PCA:
  n_components: 200
  n_clusters: 700
scrape_user:
  username: FirstGalacticEmpire
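
Every stage script reads its own section of this file with yaml.safe_load(open('params.yaml')). A slightly tidier variant, shown only as a sketch, closes the file handle via a context manager:

import yaml

with open('params.yaml') as fh:
    prepare_params = yaml.safe_load(fh)['prepare']
print(prepare_params['lower_limit'], prepare_params['upper_limit'])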

prepare.py

+68-78
@@ -1,78 +1,68 @@
Removed (previous prepare.py):

import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain,product
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import json
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import plotly.express as px
import sys
import yaml
from pathlib import Path

params = yaml.safe_load(open('params.yaml'))['prepare']
upper_limit = params['upper_limit']
lower_limit = params['lower_limit']

input_file = Path(sys.argv[1]) #'reddit_scrapper/data/scrapped_data.json'
input_index = Path(sys.argv[2]) #'reddit_scrapper/data/list_of_unique_subreddits.json'
int_output = Path('data') / 'matrix.csv'
bool_output = Path('data') / 'matrix_bool.csv'
data = json.load(open('reddit_scrapper/data/scrapped_data.json','r+'))
subreddit_names_list = json.load(open('reddit_scrapper/data/list_of_unique_subreddits.json','r+'))
subreddit_index = dict(zip(subreddit_names_list,range(len(subreddit_names_list))))
index_subreddit = dict(zip(range(len(subreddit_names_list)),subreddit_names_list))

def create_matrix(data,matrix_width,subreddit_index):
    """ Creates matrix filled with zeros and iterates over it filling the cells based on
    the subreddit-index dictionary"""
    matrix = np.zeros(shape=(len(data),matrix_width))
    for idx,redditor in enumerate(data.values()):
        for key,value in redditor.items():
            matrix[idx,subreddit_index[key]] = value
    return matrix

def filter_matrix(matrix,threshold,index_subreddit):
    mask = np.where(matrix>threshold,True,False)
    rows = ~np.all(mask==False,axis=1)
    columns = ~np.all(mask==False,axis=0)
    del mask
    data = matrix[np.ix_(rows,columns)]
    del rows
    df = pd.DataFrame(data,columns=np.squeeze(np.argwhere(columns)))
    del data,columns
    df.rename(columns=index_subreddit,inplace=True)
    return df

def extract_most_popular_subreddits(df,lower_limit,upper_limit):
    most_popular_reddits = df.sum(axis=0).sort_values(ascending=False)[lower_limit:upper_limit].index
    column_base_order = dict(zip(df.columns,range(len(df.columns))))
    column_indexes = [column_base_order[i] for i in most_popular_reddits]
    X_np = df.to_numpy()[:, column_indexes]
    del df,column_base_order,column_indexes
    zero_rows = np.where(X_np.sum(axis=1) == 0)[0]
    X_np = np.delete(X_np, zero_rows, axis=0)
    return pd.DataFrame(X_np,columns=most_popular_reddits).drop_duplicates()

matrix = create_matrix(data,len(subreddit_names_list),subreddit_index)
df = filter_matrix(matrix,5,index_subreddit)
df_bool = df.astype(bool).astype(int)
df = extract_most_popular_subreddits(df,lower_limit,upper_limit)
df.to_csv(int_output, header=None)

mask = np.where(matrix>2,True,False)
rows = ~np.all(mask==False,axis=1)
columns = ~np.all(mask==False,axis=0)
del mask
data = matrix[np.ix_(rows,columns)]
df = pd.DataFrame(data,columns=np.squeeze(np.argwhere(columns)))
del rows
del columns
del data
del matrix
df.rename(columns=index_subreddit,inplace=True)
df.to_csv(bool_output, header=None)
Added (new prepare.py):

import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain,product
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import json
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import plotly.express as px
import sys
import yaml
from pathlib import Path

# stage parameters from params.yaml
params = yaml.safe_load(open('params.yaml'))['prepare']
upper_limit = params['upper_limit']
lower_limit = params['lower_limit']

input_file = Path(sys.argv[1])  #'reddit_scrapper/data/scrapped_data.json'
input_index = Path(sys.argv[2])  #'reddit_scrapper/data/list_of_unique_subreddits.json'
Path('prepared').mkdir(parents=True, exist_ok=True)  # make sure the output directory exists
data = json.load(open(input_file,'r+'))
subreddit_names_list = json.load(open(input_index,'r+'))
subreddit_index = dict(zip(subreddit_names_list,range(len(subreddit_names_list))))
index_subreddit = dict(zip(range(len(subreddit_names_list)),subreddit_names_list))

def create_matrix(data,matrix_width,subreddit_index):
    """Creates a zero matrix and fills each cell with a redditor's activity count,
    using the subreddit-index dictionary to map subreddit names to columns."""
    matrix = np.zeros(shape=(len(data),matrix_width))
    for idx,redditor in enumerate(data.values()):
        for key,value in redditor.items():
            matrix[idx,subreddit_index[key]] = value
    return matrix

def filter_matrix(matrix,threshold,index_subreddit):
    # keep only rows and columns with at least one value above the threshold
    mask = np.where(matrix>threshold,True,False)
    rows = ~np.all(mask==False,axis=1)
    columns = ~np.all(mask==False,axis=0)
    del mask
    data = matrix[np.ix_(rows,columns)]
    del rows
    df = pd.DataFrame(data,columns=np.squeeze(np.argwhere(columns)))
    del data,columns
    df.rename(columns=index_subreddit,inplace=True)
    return df

def extract_most_popular_subreddits(df,lower_limit,upper_limit):
    # keep subreddits ranked between lower_limit and upper_limit by total activity,
    # then drop users with no remaining activity and duplicate rows
    most_popular_reddits = df.sum(axis=0).sort_values(ascending=False)[lower_limit:upper_limit].index
    column_base_order = dict(zip(df.columns,range(len(df.columns))))
    column_indexes = [column_base_order[i] for i in most_popular_reddits]
    X_np = df.to_numpy()[:, column_indexes]
    del df,column_base_order,column_indexes
    zero_rows = np.where(X_np.sum(axis=1) == 0)[0]
    X_np = np.delete(X_np, zero_rows, axis=0)
    return pd.DataFrame(X_np,columns=most_popular_reddits).drop_duplicates()

matrix = create_matrix(data,len(subreddit_names_list),subreddit_index)
df = filter_matrix(matrix,5,index_subreddit)
df = extract_most_popular_subreddits(df,lower_limit,upper_limit)
df.to_csv('prepared/matrix.csv', header=None)
print("Almost done...")
#df = filter_matrix(matrix,2,index_subreddit)
df = df.astype(bool).astype(int)  # boolean (0/1) matrix for the apriori stage
df.rename(columns=index_subreddit,inplace=True)
df.to_csv('prepared/matrix_bool.csv', header=None)
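
One thing worth double-checking: matrix.csv is written with header=None (no header row) and with the default row index, while cluster_PCA.py and cluster_TSNA.py read it back with a plain pd.read_csv(...), which treats the first data row as a header and the index as an ordinary column. A tiny round-trip sketch, with toy data and a throwaway filename (purely illustrative, not part of the commit), of a write/read combination that keeps both sides aligned:

import pandas as pd

toy = pd.DataFrame({'python': [3, 0], 'learnpython': [1, 2]})
toy.to_csv('roundtrip_check.csv', index=False)   # keep the header, drop the index
back = pd.read_csv('roundtrip_check.csv', sep=',')
assert back.shape == toy.shape and list(back.columns) == list(toy.columns)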

recommend.py

+45
@@ -0,0 +1,45 @@
import pandas as pd
import sys
import json
import pickle
import yaml
from pathlib import Path

user = None

# pipeline artefacts passed in by dvc.yaml:
# rules JSON, scraped user JSON, t-SNE clusters (csv + pickle), PCA clusters
rules = pd.read_json(sys.argv[1])
user = json.load(open(sys.argv[2]))
tsna = pd.read_csv(sys.argv[3], sep=',')
infile = open(sys.argv[4],'rb')
tsna_model = pickle.load(infile)
infile.close()
pca = pd.read_csv(sys.argv[5], sep=',')

# read_json turns the frozensets back into lists, so restore them
rules["antecedents"] = rules["antecedents"].apply(lambda x: frozenset(x))
rules["consequents"] = rules["consequents"].apply(lambda x: frozenset(x))

def get_rules(set_of_subreddits, top_n=100):
    # keep rules whose antecedents the user already subscribes to
    antecedents_rule = rules['antecedents'].apply(lambda x: set_of_subreddits.issuperset(x))
    new_rules = rules[antecedents_rule].copy()
    # new_rules["Coefficient"] = new_rules["confidence"] + new_rules["support"]
    new_rules["Coefficient"] = new_rules["lift"]
    # new_rules["Coefficient"] = new_rules["confidence"] - new_rules["support"] + new_rules["lift"] + new_rules["leverage"] + new_rules["conviction"]
    # new_rules["consequents"] = new_rules["consequents"].apply(lambda x: x - set_of_subreddits - OBVIOUS_SUBREDDITS)
    # drop consequents the user already has, then rules with nothing left to recommend
    new_rules["consequents"] = new_rules["consequents"].apply(lambda x: x - set_of_subreddits)
    new_rules = new_rules[new_rules["consequents"].apply(lambda x: len(x) > 0)]
    if len(new_rules) == 0:
        return []
    new_rules = new_rules[['consequents', "Coefficient"]]
    # one row per recommended subreddit, scored by the best rule that suggests it
    new_rules["consequents"] = new_rules["consequents"].apply(list)
    new_rules = new_rules.explode("consequents")
    new_rules = new_rules.groupby("consequents")["Coefficient"].max().reset_index()
    return list(new_rules.nlargest(top_n, "Coefficient")["consequents"])


sub_red = {k for k,v in user.items()}
print("User likes:", sub_red)
print("User should like:", get_rules(sub_red, 10))

user2 = {'user' : user}  # unfinished: cluster-based recommendations not implemented yet
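
A self-contained toy run of the same antecedent-superset idea used by get_rules(), with made-up rules and a made-up user (none of these names come from the real data):

import pandas as pd

toy_rules = pd.DataFrame({
    'antecedents': [frozenset({'python'}), frozenset({'python', 'learnpython'})],
    'consequents': [frozenset({'programming'}), frozenset({'datascience'})],
    'lift': [2.1, 3.4],
})
user_subs = {'python', 'learnpython'}

# keep rules the user's subreddits fully cover, drop what they already have
matches = toy_rules[toy_rules['antecedents'].apply(user_subs.issuperset)].copy()
matches['consequents'] = matches['consequents'].apply(lambda c: list(c - user_subs))
matches = matches[matches['consequents'].apply(len) > 0].explode('consequents')
print(matches.groupby('consequents')['lift'].max().nlargest(2))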
