Skip to content

Commit 48cb1d3

Browse files
committed
Association-rule (arules) lookup using pandas
1 parent de7ab4a commit 48cb1d3

File tree

4 files changed

+747
-8
lines changed

4 files changed

+747
-8
lines changed

PythonDB.ipynb

+327-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,329 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 2,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"rules = pd.read_json(\"data/arules.json\")  # rules exported by Untitled.ipynb via rules.to_json\n",
19+
"rules[\"antecedents\"] = rules[\"antecedents\"].apply(lambda x: frozenset(x))  # JSON serialization loses the frozenset type\n",
20+
"rules[\"consequents\"] = rules[\"consequents\"].apply(lambda x: frozenset(x))  # restore it for the set operations in get_rules"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 3,
26+
"metadata": {},
27+
"outputs": [
28+
{
29+
"data": {
30+
"text/html": [
31+
"<div>\n",
32+
"<style scoped>\n",
33+
" .dataframe tbody tr th:only-of-type {\n",
34+
" vertical-align: middle;\n",
35+
" }\n",
36+
"\n",
37+
" .dataframe tbody tr th {\n",
38+
" vertical-align: top;\n",
39+
" }\n",
40+
"\n",
41+
" .dataframe thead th {\n",
42+
" text-align: right;\n",
43+
" }\n",
44+
"</style>\n",
45+
"<table border=\"1\" class=\"dataframe\">\n",
46+
" <thead>\n",
47+
" <tr style=\"text-align: right;\">\n",
48+
" <th></th>\n",
49+
" <th>antecedents</th>\n",
50+
" <th>consequents</th>\n",
51+
" <th>antecedent support</th>\n",
52+
" <th>consequent support</th>\n",
53+
" <th>support</th>\n",
54+
" <th>confidence</th>\n",
55+
" <th>lift</th>\n",
56+
" <th>leverage</th>\n",
57+
" <th>conviction</th>\n",
58+
" </tr>\n",
59+
" </thead>\n",
60+
" <tbody>\n",
61+
" <tr>\n",
62+
" <th>0</th>\n",
63+
" <td>(IAmA)</td>\n",
64+
" <td>(AskReddit)</td>\n",
65+
" <td>0.217120</td>\n",
66+
" <td>0.480390</td>\n",
67+
" <td>0.127371</td>\n",
68+
" <td>0.586641</td>\n",
69+
" <td>1.221177</td>\n",
70+
" <td>0.023069</td>\n",
71+
" <td>1.257043</td>\n",
72+
" </tr>\n",
73+
" <tr>\n",
74+
" <th>1</th>\n",
75+
" <td>(fffffffuuuuuuuuuuuu)</td>\n",
76+
" <td>(IAmA)</td>\n",
77+
" <td>0.003648</td>\n",
78+
" <td>0.217120</td>\n",
79+
" <td>0.002052</td>\n",
80+
" <td>0.562500</td>\n",
81+
" <td>2.590737</td>\n",
82+
" <td>0.001260</td>\n",
83+
" <td>1.789441</td>\n",
84+
" </tr>\n",
85+
" <tr>\n",
86+
" <th>2</th>\n",
87+
" <td>(todayilearned)</td>\n",
88+
" <td>(IAmA)</td>\n",
89+
" <td>0.013818</td>\n",
90+
" <td>0.217120</td>\n",
91+
" <td>0.008072</td>\n",
92+
" <td>0.584158</td>\n",
93+
" <td>2.690491</td>\n",
94+
" <td>0.005072</td>\n",
95+
" <td>1.882641</td>\n",
96+
" </tr>\n",
97+
" <tr>\n",
98+
" <th>3</th>\n",
99+
" <td>(wikipedia)</td>\n",
100+
" <td>(IAmA)</td>\n",
101+
" <td>0.004378</td>\n",
102+
" <td>0.217120</td>\n",
103+
" <td>0.002280</td>\n",
104+
" <td>0.520833</td>\n",
105+
" <td>2.398831</td>\n",
106+
" <td>0.001330</td>\n",
107+
" <td>1.633837</td>\n",
108+
" </tr>\n",
109+
" <tr>\n",
110+
" <th>4</th>\n",
111+
" <td>(bestof)</td>\n",
112+
" <td>(IAmA)</td>\n",
113+
" <td>0.002782</td>\n",
114+
" <td>0.217120</td>\n",
115+
" <td>0.001687</td>\n",
116+
" <td>0.606557</td>\n",
117+
" <td>2.793655</td>\n",
118+
" <td>0.001083</td>\n",
119+
" <td>1.989821</td>\n",
120+
" </tr>\n",
121+
" <tr>\n",
122+
" <th>...</th>\n",
123+
" <td>...</td>\n",
124+
" <td>...</td>\n",
125+
" <td>...</td>\n",
126+
" <td>...</td>\n",
127+
" <td>...</td>\n",
128+
" <td>...</td>\n",
129+
" <td>...</td>\n",
130+
" <td>...</td>\n",
131+
" <td>...</td>\n",
132+
" </tr>\n",
133+
" <tr>\n",
134+
" <th>102302</th>\n",
135+
" <td>(programming, pics, funny, politics, worldnews...</td>\n",
136+
" <td>(technology, WTF, reddit.com)</td>\n",
137+
" <td>0.002417</td>\n",
138+
" <td>0.017968</td>\n",
139+
" <td>0.001323</td>\n",
140+
" <td>0.547170</td>\n",
141+
" <td>30.452639</td>\n",
142+
" <td>0.001279</td>\n",
143+
" <td>2.168654</td>\n",
144+
" </tr>\n",
145+
" <tr>\n",
146+
" <th>102303</th>\n",
147+
" <td>(programming, funny, politics, WTF, worldnews,...</td>\n",
148+
" <td>(pics, technology, reddit.com)</td>\n",
149+
" <td>0.002371</td>\n",
150+
" <td>0.017785</td>\n",
151+
" <td>0.001323</td>\n",
152+
" <td>0.557692</td>\n",
153+
" <td>31.356607</td>\n",
154+
" <td>0.001280</td>\n",
155+
" <td>2.220659</td>\n",
156+
" </tr>\n",
157+
" <tr>\n",
158+
" <th>102304</th>\n",
159+
" <td>(programming, pics, funny, politics, WTF, worl...</td>\n",
160+
" <td>(science, technology, reddit.com)</td>\n",
161+
" <td>0.002645</td>\n",
162+
" <td>0.014867</td>\n",
163+
" <td>0.001323</td>\n",
164+
" <td>0.500000</td>\n",
165+
" <td>33.631902</td>\n",
166+
" <td>0.001283</td>\n",
167+
" <td>1.970266</td>\n",
168+
" </tr>\n",
169+
" <tr>\n",
170+
" <th>102305</th>\n",
171+
" <td>(programming, funny, politics, worldnews, tech...</td>\n",
172+
" <td>(science, pics, WTF, reddit.com)</td>\n",
173+
" <td>0.002326</td>\n",
174+
" <td>0.021707</td>\n",
175+
" <td>0.001323</td>\n",
176+
" <td>0.568627</td>\n",
177+
" <td>26.195090</td>\n",
178+
" <td>0.001272</td>\n",
179+
" <td>2.267860</td>\n",
180+
" </tr>\n",
181+
" <tr>\n",
182+
" <th>102306</th>\n",
183+
" <td>(programming, funny, WTF, worldnews, technology)</td>\n",
184+
" <td>(pics, science, politics, reddit.com)</td>\n",
185+
" <td>0.002599</td>\n",
186+
" <td>0.016417</td>\n",
187+
" <td>0.001323</td>\n",
188+
" <td>0.508772</td>\n",
189+
" <td>30.989864</td>\n",
190+
" <td>0.001280</td>\n",
191+
" <td>2.002293</td>\n",
192+
" </tr>\n",
193+
" </tbody>\n",
194+
"</table>\n",
195+
"<p>102307 rows × 9 columns</p>\n",
196+
"</div>"
197+
],
198+
"text/plain": [
199+
" antecedents \\\n",
200+
"0 (IAmA) \n",
201+
"1 (fffffffuuuuuuuuuuuu) \n",
202+
"2 (todayilearned) \n",
203+
"3 (wikipedia) \n",
204+
"4 (bestof) \n",
205+
"... ... \n",
206+
"102302 (programming, pics, funny, politics, worldnews... \n",
207+
"102303 (programming, funny, politics, WTF, worldnews,... \n",
208+
"102304 (programming, pics, funny, politics, WTF, worl... \n",
209+
"102305 (programming, funny, politics, worldnews, tech... \n",
210+
"102306 (programming, funny, WTF, worldnews, technology) \n",
211+
"\n",
212+
" consequents antecedent support \\\n",
213+
"0 (AskReddit) 0.217120 \n",
214+
"1 (IAmA) 0.003648 \n",
215+
"2 (IAmA) 0.013818 \n",
216+
"3 (IAmA) 0.004378 \n",
217+
"4 (IAmA) 0.002782 \n",
218+
"... ... ... \n",
219+
"102302 (technology, WTF, reddit.com) 0.002417 \n",
220+
"102303 (pics, technology, reddit.com) 0.002371 \n",
221+
"102304 (science, technology, reddit.com) 0.002645 \n",
222+
"102305 (science, pics, WTF, reddit.com) 0.002326 \n",
223+
"102306 (pics, science, politics, reddit.com) 0.002599 \n",
224+
"\n",
225+
" consequent support support confidence lift leverage \\\n",
226+
"0 0.480390 0.127371 0.586641 1.221177 0.023069 \n",
227+
"1 0.217120 0.002052 0.562500 2.590737 0.001260 \n",
228+
"2 0.217120 0.008072 0.584158 2.690491 0.005072 \n",
229+
"3 0.217120 0.002280 0.520833 2.398831 0.001330 \n",
230+
"4 0.217120 0.001687 0.606557 2.793655 0.001083 \n",
231+
"... ... ... ... ... ... \n",
232+
"102302 0.017968 0.001323 0.547170 30.452639 0.001279 \n",
233+
"102303 0.017785 0.001323 0.557692 31.356607 0.001280 \n",
234+
"102304 0.014867 0.001323 0.500000 33.631902 0.001283 \n",
235+
"102305 0.021707 0.001323 0.568627 26.195090 0.001272 \n",
236+
"102306 0.016417 0.001323 0.508772 30.989864 0.001280 \n",
237+
"\n",
238+
" conviction \n",
239+
"0 1.257043 \n",
240+
"1 1.789441 \n",
241+
"2 1.882641 \n",
242+
"3 1.633837 \n",
243+
"4 1.989821 \n",
244+
"... ... \n",
245+
"102302 2.168654 \n",
246+
"102303 2.220659 \n",
247+
"102304 1.970266 \n",
248+
"102305 2.267860 \n",
249+
"102306 2.002293 \n",
250+
"\n",
251+
"[102307 rows x 9 columns]"
252+
]
253+
},
254+
"execution_count": 3,
255+
"metadata": {},
256+
"output_type": "execute_result"
257+
}
258+
],
259+
"source": [
260+
"rules"
261+
]
262+
},
263+
{
264+
"cell_type": "code",
265+
"execution_count": 4,
266+
"metadata": {},
267+
"outputs": [],
268+
"source": [
269+
"OBVIOUS_SUBREDDITS = {'reddit.com', 'WTF'}  # uninformative subreddits removed from consequents in get_rules"
270+
]
271+
},
272+
{
273+
"cell_type": "code",
274+
"execution_count": 5,
275+
"metadata": {},
276+
"outputs": [],
277+
"source": [
278+
"def get_rules(set_of_subreddits):  # return up to 20 consequent subreddits ranked by confidence + support\n",
279+
"    antecedents_rule = rules['antecedents'].apply(lambda x: x.issuperset(set_of_subreddits))  # mask: rules whose antecedents contain all requested subreddits\n",
280+
"    new_rules = rules[antecedents_rule].copy()  # copy so the module-level rules frame is not mutated\n",
281+
"    new_rules[\"Coefficient\"] = new_rules[\"confidence\"] + new_rules[\"support\"]  # ranking score; alternative metric considered: new_rules[\"lift\"]\n",
282+
"    new_rules[\"consequents\"] = new_rules[\"consequents\"].apply(lambda x: x - set_of_subreddits - OBVIOUS_SUBREDDITS)  # drop query and uninformative subreddits\n",
283+
"    new_rules = new_rules[new_rules[\"consequents\"].apply(lambda x: len(x) > 0)][['consequents', \"Coefficient\"]]  # keep rules that still suggest something new\n",
284+
"    new_rules = new_rules.explode(\"consequents\")\n",
285+
"    new_rules[\"consequents\"] = new_rules[\"consequents\"].apply(lambda x:list(x)[0])  # NOTE(review): relies on explode leaving frozensets intact; on pandas versions where explode splits sets, x is a str here and list(x)[0] keeps only its first character - verify\n",
286+
"    new_rules = new_rules.groupby(\"consequents\")[\"Coefficient\"].max().reset_index() # best score per consequent\n",
287+
"    return list(new_rules.nlargest(20, \"Coefficient\")[\"consequents\"])  # top 20 names by score"
288+
]
289+
},
290+
{
291+
"cell_type": "code",
292+
"execution_count": 11,
293+
"metadata": {},
294+
"outputs": [
295+
{
296+
"data": {
297+
"text/plain": [
298+
"['AskReddit',\n",
299+
" 'funny',\n",
300+
" 'pics',\n",
301+
" 'science',\n",
302+
" 'politics',\n",
303+
" 'IAmA',\n",
304+
" 'worldnews',\n",
305+
" 'technology',\n",
306+
" 'gaming',\n",
307+
" 'atheism']"
308+
]
309+
},
310+
"execution_count": 11,
311+
"metadata": {},
312+
"output_type": "execute_result"
313+
}
314+
],
315+
"source": [
316+
"df = get_rules({'programming'})  # NOTE(review): get_rules returns a list, not a DataFrame - the name df is misleading\n",
317+
"df"
318+
]
319+
},
320+
{
321+
"cell_type": "code",
322+
"execution_count": null,
323+
"metadata": {},
324+
"outputs": [],
325+
"source": []
326+
},
3327
{
4328
"cell_type": "code",
5329
"execution_count": null,
@@ -10,9 +334,9 @@
10334
],
11335
"metadata": {
12336
"kernelspec": {
13-
"display_name": "Python [conda env:dataviz]",
337+
"display_name": "Python 3",
14338
"language": "python",
15-
"name": "conda-env-dataviz-py"
339+
"name": "python3"
16340
},
17341
"language_info": {
18342
"codemirror_mode": {
@@ -24,7 +348,7 @@
24348
"name": "python",
25349
"nbconvert_exporter": "python",
26350
"pygments_lexer": "ipython3",
27-
"version": "3.8.8"
351+
"version": "3.7.6"
28352
}
29353
},
30354
"nbformat": 4,

Untitled.ipynb

+7-5
Original file line numberDiff line numberDiff line change
@@ -17819,7 +17819,7 @@
1781917819
],
1782017820
"source": [
1782117821
"frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)\n",
17822-
"association_rules(frequent_itemsets, \n",
17822+
"rules = association_rules(frequent_itemsets, \n",
1782317823
" metric='confidence', \n",
1782417824
" min_threshold=0.7)"
1782517825
]
@@ -17829,14 +17829,16 @@
1782917829
"execution_count": null,
1783017830
"metadata": {},
1783117831
"outputs": [],
17832-
"source": []
17832+
"source": [
17833+
"rules.to_json(\"data/arules.json\")  # persisted rules consumed by PythonDB.ipynb"
17834+
]
1783317835
}
1783417836
],
1783517837
"metadata": {
1783617838
"kernelspec": {
17837-
"display_name": "Python 3",
17839+
"display_name": "Python [conda env:dataviz]",
1783817840
"language": "python",
17839-
"name": "python3"
17841+
"name": "conda-env-dataviz-py"
1784017842
},
1784117843
"language_info": {
1784217844
"codemirror_mode": {
@@ -17848,7 +17850,7 @@
1784817850
"name": "python",
1784917851
"nbconvert_exporter": "python",
1785017852
"pygments_lexer": "ipython3",
17851-
"version": "3.8.5"
17853+
"version": "3.8.8"
1785217854
},
1785317855
"toc": {
1785417856
"base_numbering": 1,

0 commit comments

Comments
 (0)