Skip to content

Commit 48cb1d3

Browse files
committed
Association-rule (arules) lookup using pandas
1 parent de7ab4a commit 48cb1d3

File tree

4 files changed

+747
-8
lines changed

4 files changed

+747
-8
lines changed

PythonDB.ipynb

+327-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,329 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 2,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"rules = pd.read_json(\"data/arules.json\")  # rules exported by Untitled.ipynb via rules.to_json\n",
19+
"rules[\"antecedents\"] = rules[\"antecedents\"].apply(lambda x: frozenset(x))  # JSON serialization loses the frozenset type\n",
20+
"rules[\"consequents\"] = rules[\"consequents\"].apply(lambda x: frozenset(x))  # restore it for the set operations in get_rules"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 3,
26+
"metadata": {},
27+
"outputs": [
28+
{
29+
"data": {
30+
"text/html": [
31+
"<div>\n",
32+
"<style scoped>\n",
33+
" .dataframe tbody tr th:only-of-type {\n",
34+
" vertical-align: middle;\n",
35+
" }\n",
36+
"\n",
37+
" .dataframe tbody tr th {\n",
38+
" vertical-align: top;\n",
39+
" }\n",
40+
"\n",
41+
" .dataframe thead th {\n",
42+
" text-align: right;\n",
43+
" }\n",
44+
"</style>\n",
45+
"<table border=\"1\" class=\"dataframe\">\n",
46+
" <thead>\n",
47+
" <tr style=\"text-align: right;\">\n",
48+
" <th></th>\n",
49+
" <th>antecedents</th>\n",
50+
" <th>consequents</th>\n",
51+
" <th>antecedent support</th>\n",
52+
" <th>consequent support</th>\n",
53+
" <th>support</th>\n",
54+
" <th>confidence</th>\n",
55+
" <th>lift</th>\n",
56+
" <th>leverage</th>\n",
57+
" <th>conviction</th>\n",
58+
" </tr>\n",
59+
" </thead>\n",
60+
" <tbody>\n",
61+
" <tr>\n",
62+
" <th>0</th>\n",
63+
" <td>(IAmA)</td>\n",
64+
" <td>(AskReddit)</td>\n",
65+
" <td>0.217120</td>\n",
66+
" <td>0.480390</td>\n",
67+
" <td>0.127371</td>\n",
68+
" <td>0.586641</td>\n",
69+
" <td>1.221177</td>\n",
70+
" <td>0.023069</td>\n",
71+
" <td>1.257043</td>\n",
72+
" </tr>\n",
73+
" <tr>\n",
74+
" <th>1</th>\n",
75+
" <td>(fffffffuuuuuuuuuuuu)</td>\n",
76+
" <td>(IAmA)</td>\n",
77+
" <td>0.003648</td>\n",
78+
" <td>0.217120</td>\n",
79+
" <td>0.002052</td>\n",
80+
" <td>0.562500</td>\n",
81+
" <td>2.590737</td>\n",
82+
" <td>0.001260</td>\n",
83+
" <td>1.789441</td>\n",
84+
" </tr>\n",
85+
" <tr>\n",
86+
" <th>2</th>\n",
87+
" <td>(todayilearned)</td>\n",
88+
" <td>(IAmA)</td>\n",
89+
" <td>0.013818</td>\n",
90+
" <td>0.217120</td>\n",
91+
" <td>0.008072</td>\n",
92+
" <td>0.584158</td>\n",
93+
" <td>2.690491</td>\n",
94+
" <td>0.005072</td>\n",
95+
" <td>1.882641</td>\n",
96+
" </tr>\n",
97+
" <tr>\n",
98+
" <th>3</th>\n",
99+
" <td>(wikipedia)</td>\n",
100+
" <td>(IAmA)</td>\n",
101+
" <td>0.004378</td>\n",
102+
" <td>0.217120</td>\n",
103+
" <td>0.002280</td>\n",
104+
" <td>0.520833</td>\n",
105+
" <td>2.398831</td>\n",
106+
" <td>0.001330</td>\n",
107+
" <td>1.633837</td>\n",
108+
" </tr>\n",
109+
" <tr>\n",
110+
" <th>4</th>\n",
111+
" <td>(bestof)</td>\n",
112+
" <td>(IAmA)</td>\n",
113+
" <td>0.002782</td>\n",
114+
" <td>0.217120</td>\n",
115+
" <td>0.001687</td>\n",
116+
" <td>0.606557</td>\n",
117+
" <td>2.793655</td>\n",
118+
" <td>0.001083</td>\n",
119+
" <td>1.989821</td>\n",
120+
" </tr>\n",
121+
" <tr>\n",
122+
" <th>...</th>\n",
123+
" <td>...</td>\n",
124+
" <td>...</td>\n",
125+
" <td>...</td>\n",
126+
" <td>...</td>\n",
127+
" <td>...</td>\n",
128+
" <td>...</td>\n",
129+
" <td>...</td>\n",
130+
" <td>...</td>\n",
131+
" <td>...</td>\n",
132+
" </tr>\n",
133+
" <tr>\n",
134+
" <th>102302</th>\n",
135+
" <td>(programming, pics, funny, politics, worldnews...</td>\n",
136+
" <td>(technology, WTF, reddit.com)</td>\n",
137+
" <td>0.002417</td>\n",
138+
" <td>0.017968</td>\n",
139+
" <td>0.001323</td>\n",
140+
" <td>0.547170</td>\n",
141+
" <td>30.452639</td>\n",
142+
" <td>0.001279</td>\n",
143+
" <td>2.168654</td>\n",
144+
" </tr>\n",
145+
" <tr>\n",
146+
" <th>102303</th>\n",
147+
" <td>(programming, funny, politics, WTF, worldnews,...</td>\n",
148+
" <td>(pics, technology, reddit.com)</td>\n",
149+
" <td>0.002371</td>\n",
150+
" <td>0.017785</td>\n",
151+
" <td>0.001323</td>\n",
152+
" <td>0.557692</td>\n",
153+
" <td>31.356607</td>\n",
154+
" <td>0.001280</td>\n",
155+
" <td>2.220659</td>\n",
156+
" </tr>\n",
157+
" <tr>\n",
158+
" <th>102304</th>\n",
159+
" <td>(programming, pics, funny, politics, WTF, worl...</td>\n",
160+
" <td>(science, technology, reddit.com)</td>\n",
161+
" <td>0.002645</td>\n",
162+
" <td>0.014867</td>\n",
163+
" <td>0.001323</td>\n",
164+
" <td>0.500000</td>\n",
165+
" <td>33.631902</td>\n",
166+
" <td>0.001283</td>\n",
167+
" <td>1.970266</td>\n",
168+
" </tr>\n",
169+
" <tr>\n",
170+
" <th>102305</th>\n",
171+
" <td>(programming, funny, politics, worldnews, tech...</td>\n",
172+
" <td>(science, pics, WTF, reddit.com)</td>\n",
173+
" <td>0.002326</td>\n",
174+
" <td>0.021707</td>\n",
175+
" <td>0.001323</td>\n",
176+
" <td>0.568627</td>\n",
177+
" <td>26.195090</td>\n",
178+
" <td>0.001272</td>\n",
179+
" <td>2.267860</td>\n",
180+
" </tr>\n",
181+
" <tr>\n",
182+
" <th>102306</th>\n",
183+
" <td>(programming, funny, WTF, worldnews, technology)</td>\n",
184+
" <td>(pics, science, politics, reddit.com)</td>\n",
185+
" <td>0.002599</td>\n",
186+
" <td>0.016417</td>\n",
187+
" <td>0.001323</td>\n",
188+
" <td>0.508772</td>\n",
189+
" <td>30.989864</td>\n",
190+
" <td>0.001280</td>\n",
191+
" <td>2.002293</td>\n",
192+
" </tr>\n",
193+
" </tbody>\n",
194+
"</table>\n",
195+
"<p>102307 rows × 9 columns</p>\n",
196+
"</div>"
197+
],
198+
"text/plain": [
199+
" antecedents \\\n",
200+
"0 (IAmA) \n",
201+
"1 (fffffffuuuuuuuuuuuu) \n",
202+
"2 (todayilearned) \n",
203+
"3 (wikipedia) \n",
204+
"4 (bestof) \n",
205+
"... ... \n",
206+
"102302 (programming, pics, funny, politics, worldnews... \n",
207+
"102303 (programming, funny, politics, WTF, worldnews,... \n",
208+
"102304 (programming, pics, funny, politics, WTF, worl... \n",
209+
"102305 (programming, funny, politics, worldnews, tech... \n",
210+
"102306 (programming, funny, WTF, worldnews, technology) \n",
211+
"\n",
212+
" consequents antecedent support \\\n",
213+
"0 (AskReddit) 0.217120 \n",
214+
"1 (IAmA) 0.003648 \n",
215+
"2 (IAmA) 0.013818 \n",
216+
"3 (IAmA) 0.004378 \n",
217+
"4 (IAmA) 0.002782 \n",
218+
"... ... ... \n",
219+
"102302 (technology, WTF, reddit.com) 0.002417 \n",
220+
"102303 (pics, technology, reddit.com) 0.002371 \n",
221+
"102304 (science, technology, reddit.com) 0.002645 \n",
222+
"102305 (science, pics, WTF, reddit.com) 0.002326 \n",
223+
"102306 (pics, science, politics, reddit.com) 0.002599 \n",
224+
"\n",
225+
" consequent support support confidence lift leverage \\\n",
226+
"0 0.480390 0.127371 0.586641 1.221177 0.023069 \n",
227+
"1 0.217120 0.002052 0.562500 2.590737 0.001260 \n",
228+
"2 0.217120 0.008072 0.584158 2.690491 0.005072 \n",
229+
"3 0.217120 0.002280 0.520833 2.398831 0.001330 \n",
230+
"4 0.217120 0.001687 0.606557 2.793655 0.001083 \n",
231+
"... ... ... ... ... ... \n",
232+
"102302 0.017968 0.001323 0.547170 30.452639 0.001279 \n",
233+
"102303 0.017785 0.001323 0.557692 31.356607 0.001280 \n",
234+
"102304 0.014867 0.001323 0.500000 33.631902 0.001283 \n",
235+
"102305 0.021707 0.001323 0.568627 26.195090 0.001272 \n",
236+
"102306 0.016417 0.001323 0.508772 30.989864 0.001280 \n",
237+
"\n",
238+
" conviction \n",
239+
"0 1.257043 \n",
240+
"1 1.789441 \n",
241+
"2 1.882641 \n",
242+
"3 1.633837 \n",
243+
"4 1.989821 \n",
244+
"... ... \n",
245+
"102302 2.168654 \n",
246+
"102303 2.220659 \n",
247+
"102304 1.970266 \n",
248+
"102305 2.267860 \n",
249+
"102306 2.002293 \n",
250+
"\n",
251+
"[102307 rows x 9 columns]"
252+
]
253+
},
254+
"execution_count": 3,
255+
"metadata": {},
256+
"output_type": "execute_result"
257+
}
258+
],
259+
"source": [
260+
"rules"
261+
]
262+
},
263+
{
264+
"cell_type": "code",
265+
"execution_count": 4,
266+
"metadata": {},
267+
"outputs": [],
268+
"source": [
269+
"OBVIOUS_SUBREDDITS = {'reddit.com', 'WTF'}  # uninformative subreddits removed from consequents in get_rules"
270+
]
271+
},
272+
{
273+
"cell_type": "code",
274+
"execution_count": 5,
275+
"metadata": {},
276+
"outputs": [],
277+
"source": [
278+
"def get_rules(set_of_subreddits):  # return up to 20 consequent subreddits ranked by confidence + support\n",
279+
"    antecedents_rule = rules['antecedents'].apply(lambda x: x.issuperset(set_of_subreddits))  # mask: rules whose antecedents contain all requested subreddits\n",
280+
"    new_rules = rules[antecedents_rule].copy()  # copy so the module-level rules frame is not mutated\n",
281+
"    new_rules[\"Coefficient\"] = new_rules[\"confidence\"] + new_rules[\"support\"]  # ranking score; alternative metric considered: new_rules[\"lift\"]\n",
282+
"    new_rules[\"consequents\"] = new_rules[\"consequents\"].apply(lambda x: x - set_of_subreddits - OBVIOUS_SUBREDDITS)  # drop query and uninformative subreddits\n",
283+
"    new_rules = new_rules[new_rules[\"consequents\"].apply(lambda x: len(x) > 0)][['consequents', \"Coefficient\"]]  # keep rules that still suggest something new\n",
284+
"    new_rules = new_rules.explode(\"consequents\")\n",
285+
"    new_rules[\"consequents\"] = new_rules[\"consequents\"].apply(lambda x:list(x)[0])  # NOTE(review): relies on explode leaving frozensets intact; on pandas versions where explode splits sets, x is a str here and list(x)[0] keeps only its first character - verify\n",
286+
"    new_rules = new_rules.groupby(\"consequents\")[\"Coefficient\"].max().reset_index() # best score per consequent\n",
287+
"    return list(new_rules.nlargest(20, \"Coefficient\")[\"consequents\"])  # top 20 names by score"
288+
]
289+
},
290+
{
291+
"cell_type": "code",
292+
"execution_count": 11,
293+
"metadata": {},
294+
"outputs": [
295+
{
296+
"data": {
297+
"text/plain": [
298+
"['AskReddit',\n",
299+
" 'funny',\n",
300+
" 'pics',\n",
301+
" 'science',\n",
302+
" 'politics',\n",
303+
" 'IAmA',\n",
304+
" 'worldnews',\n",
305+
" 'technology',\n",
306+
" 'gaming',\n",
307+
" 'atheism']"
308+
]
309+
},
310+
"execution_count": 11,
311+
"metadata": {},
312+
"output_type": "execute_result"
313+
}
314+
],
315+
"source": [
316+
"df = get_rules({'programming'})  # NOTE(review): get_rules returns a list, not a DataFrame - the name df is misleading\n",
317+
"df"
318+
]
319+
},
320+
{
321+
"cell_type": "code",
322+
"execution_count": null,
323+
"metadata": {},
324+
"outputs": [],
325+
"source": []
326+
},
3327
{
4328
"cell_type": "code",
5329
"execution_count": null,
@@ -10,9 +334,9 @@
10334
],
11335
"metadata": {
12336
"kernelspec": {
13-
"display_name": "Python [conda env:dataviz]",
337+
"display_name": "Python 3",
14338
"language": "python",
15-
"name": "conda-env-dataviz-py"
339+
"name": "python3"
16340
},
17341
"language_info": {
18342
"codemirror_mode": {
@@ -24,7 +348,7 @@
24348
"name": "python",
25349
"nbconvert_exporter": "python",
26350
"pygments_lexer": "ipython3",
27-
"version": "3.8.8"
351+
"version": "3.7.6"
28352
}
29353
},
30354
"nbformat": 4,

Untitled.ipynb

+7-5
Original file line numberDiff line numberDiff line change
@@ -17819,7 +17819,7 @@
1781917819
],
1782017820
"source": [
1782117821
"frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)\n",
17822-
"association_rules(frequent_itemsets, \n",
17822+
"rules = association_rules(frequent_itemsets, \n",
1782317823
" metric='confidence', \n",
1782417824
" min_threshold=0.7)"
1782517825
]
@@ -17829,14 +17829,16 @@
1782917829
"execution_count": null,
1783017830
"metadata": {},
1783117831
"outputs": [],
17832-
"source": []
17832+
"source": [
17833+
"rules.to_json(\"data/arules.json\")  # persisted rules consumed by PythonDB.ipynb"
17834+
]
1783317835
}
1783417836
],
1783517837
"metadata": {
1783617838
"kernelspec": {
17837-
"display_name": "Python 3",
17839+
"display_name": "Python [conda env:dataviz]",
1783817840
"language": "python",
17839-
"name": "python3"
17841+
"name": "conda-env-dataviz-py"
1784017842
},
1784117843
"language_info": {
1784217844
"codemirror_mode": {
@@ -17848,7 +17850,7 @@
1784817850
"name": "python",
1784917851
"nbconvert_exporter": "python",
1785017852
"pygments_lexer": "ipython3",
17851-
"version": "3.8.5"
17853+
"version": "3.8.8"
1785217854
},
1785317855
"toc": {
1785417856
"base_numbering": 1,

0 commit comments

Comments
 (0)