From ecd0c741da11c6a3aa9970978e102be9122ae958 Mon Sep 17 00:00:00 2001
From: cristinarosa97 <cristinarosaquero@gmail.com>
Date: Fri, 6 Dec 2024 17:09:07 +0100
Subject: [PATCH] w2 lab3 done

---
 lab-dw-data-structuring-and-combining.ipynb | 944 +++++++++++++++++++-
 1 file changed, 935 insertions(+), 9 deletions(-)

diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index ec4e3f9..999ee6d 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,400 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "492d06e3-92c7-4105-ac72-536db98d3244",
    "metadata": {
     "id": "492d06e3-92c7-4105-ac72-536db98d3244"
    },
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "\n",
+    "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n",
+    "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n",
+    "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fe258bb4-cbf3-4d12-a0cc-aaadbaafa392",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.columns = df1.columns.str.strip().str.lower()\n",
+    "df2.columns = df2.columns.str.strip().str.lower()\n",
+    "df3.columns = df3.columns.str.strip().str.lower()\n",
+    "df1.rename(columns = {\"st\":\"state\"}, inplace=True)\n",
+    "df2.rename(columns = {\"st\":\"state\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "dece8c2d-4b35-49cc-a2af-55ab4863c6ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['customer', 'state', 'gender', 'education', 'customer lifetime value',\n",
+      "       'income', 'monthly premium auto', 'number of open complaints',\n",
+      "       'policy type', 'vehicle class', 'total claim amount'],\n",
+      "      dtype='object')\n",
+      "Index(['customer', 'state', 'gender', 'education', 'customer lifetime value',\n",
+      "       'income', 'monthly premium auto', 'number of open complaints',\n",
+      "       'total claim amount', 'policy type', 'vehicle class'],\n",
+      "      dtype='object')\n",
+      "Index(['customer', 'state', 'customer lifetime value', 'education', 'gender',\n",
+      "       'income', 'monthly premium auto', 'number of open complaints',\n",
+      "       'policy type', 'total claim amount', 'vehicle class'],\n",
+      "      dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df1.columns)\n",
+    "print(df2.columns)\n",
+    "print(df3.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4e21e48a-74a7-40f1-afd0-ad46fca2114f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(12074, 11)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.concat([df1, df2, df3], ignore_index=True)\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "841c003d-96fb-44b0-adeb-56d64b37a5f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n",
+       "       'income', 'monthly_premium_auto', 'number_of_open_complaints',\n",
+       "       'policy_type', 'vehicle_class', 'total_claim_amount'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.columns = [column.replace(\" \", \"_\") for column in df.columns]\n",
+    "\n",
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "490500ef-f69f-4162-ab39-4fab87ce5c28",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2939"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.duplicated().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "99ab7bd2-b289-4198-a4c2-4e6c8b86f43f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\crist\\AppData\\Local\\Temp\\ipykernel_31220\\2421099744.py:7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[\"gender\"].fillna(\"Unknown\", inplace=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "gender_mapping = {\"M\":\"M\",\n",
+    "                  \"F\":\"F\",\n",
+    "                  \"Male\":\"M\",\n",
+    "                  \"female\": \"F\", \n",
+    "                  \"Femal\": \"F\"}\n",
+    "df[\"gender\"] = df[\"gender\"].map(gender_mapping)\n",
+    "df[\"gender\"] = df[\"gender\"].fillna(\"Unknown\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "9518002d-5775-416a-8545-329d0ff72f60",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\crist\\AppData\\Local\\Temp\\ipykernel_31220\\2702104280.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[\"state\"].replace(\"Cali\", \"California\", inplace=True)\n",
+      "C:\\Users\\crist\\AppData\\Local\\Temp\\ipykernel_31220\\2702104280.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[\"state\"].replace(\"AZ\", \"Arizona\",inplace=True)\n",
+      "C:\\Users\\crist\\AppData\\Local\\Temp\\ipykernel_31220\\2702104280.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[\"state\"].replace(\"WA\", \"Washington\",inplace=True)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', nan],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"state\"] = df[\"state\"].replace(\"Cali\", \"California\")\n",
+    "df[\"state\"] = df[\"state\"].replace(\"AZ\", \"Arizona\")\n",
+    "df[\"state\"] = df[\"state\"].replace(\"WA\", \"Washington\")\n",
+    "\n",
+    "df[\"state\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "0743cb66-ffe2-45d6-a25a-d22ddf2f0fea",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\crist\\AppData\\Local\\Temp\\ipykernel_31220\\2476333873.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[\"education\"].replace(\"Bachelors\", \"Bachelor\",inplace=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df[\"education\"] = df[\"education\"].replace(\"Bachelors\", \"Bachelor\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "33ef5b50-3b5d-4b47-90be-7c8475bfe9a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['customer_lifetime_value'] = df['customer_lifetime_value'].replace('%', '', regex=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "8891a27b-2703-4241-9e2c-bbebf8db9c20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_luxury(car):\n",
+    "    if car == \"Sports Car\":\n",
+    "        return \"Luxury\"\n",
+    "    elif car == \"Luxury SUV\":\n",
+    "        return \"Luxury\"\n",
+    "    elif car == \"Luxury Car\":\n",
+    "        return \"Luxury\"\n",
+    "    else:\n",
+    "        return car\n",
+    "\n",
+    "df[\"vehicle_class\"] = df[\"vehicle_class\"].apply(convert_luxury)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0880609b-be9a-4c44-bb93-1a65603b2767",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"customer_lifetime_value\"] = pd.to_numeric(df[\"customer_lifetime_value\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "66758554-04c4-4d4c-bbe9-855d9994bca3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "customer                      2937\n",
+       "state                         2937\n",
+       "gender                           0\n",
+       "education                     2937\n",
+       "customer_lifetime_value      10014\n",
+       "income                        2937\n",
+       "monthly_premium_auto          2937\n",
+       "number_of_open_complaints     2937\n",
+       "policy_type                   2937\n",
+       "vehicle_class                 2937\n",
+       "total_claim_amount            2937\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "2fb340f3-57d0-4b20-89e4-709af4145eb6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "customer                        0\n",
+       "state                           0\n",
+       "gender                          0\n",
+       "education                       0\n",
+       "customer_lifetime_value      7077\n",
+       "income                          0\n",
+       "monthly_premium_auto            0\n",
+       "number_of_open_complaints       0\n",
+       "policy_type                     0\n",
+       "vehicle_class                   0\n",
+       "total_claim_amount              0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.dropna(thresh=6, inplace=True) \n",
+    "df.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "cefdc694-d386-427a-b8a2-9cf4a8663e36",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    9134\n",
+       "True        3\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.duplicated().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "3ffb74c0-fc6a-4fc9-83a5-a2f287aa0840",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    9134\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.drop_duplicates(inplace=True)\n",
+    "df.duplicated().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "1889052a-254f-486d-9ad7-a0ef96bb4755",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.reset_index(drop=True, inplace=True)"
    ]
   },
   {
@@ -72,14 +458,349 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>unnamed:_0</th>\n",
+       "      <th>customer</th>\n",
+       "      <th>state</th>\n",
+       "      <th>customer_lifetime_value</th>\n",
+       "      <th>response</th>\n",
+       "      <th>coverage</th>\n",
+       "      <th>education</th>\n",
+       "      <th>effective_to_date</th>\n",
+       "      <th>employmentstatus</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>...</th>\n",
+       "      <th>number_of_policies</th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>policy</th>\n",
+       "      <th>renew_offer_type</th>\n",
+       "      <th>sales_channel</th>\n",
+       "      <th>total_claim_amount</th>\n",
+       "      <th>vehicle_class</th>\n",
+       "      <th>vehicle_size</th>\n",
+       "      <th>vehicle_type</th>\n",
+       "      <th>month</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>DK49336</td>\n",
+       "      <td>Arizona</td>\n",
+       "      <td>4809.216960</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-02-18</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>9</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>Corporate L3</td>\n",
+       "      <td>Offer3</td>\n",
+       "      <td>Agent</td>\n",
+       "      <td>292.800000</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>KX64629</td>\n",
+       "      <td>California</td>\n",
+       "      <td>2228.525238</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-01-18</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>F</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L3</td>\n",
+       "      <td>Offer4</td>\n",
+       "      <td>Call Center</td>\n",
+       "      <td>744.924331</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>LZ68649</td>\n",
+       "      <td>Washington</td>\n",
+       "      <td>14947.917300</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>2011-02-10</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L3</td>\n",
+       "      <td>Offer3</td>\n",
+       "      <td>Call Center</td>\n",
+       "      <td>480.000000</td>\n",
+       "      <td>SUV</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>XL78013</td>\n",
+       "      <td>Oregon</td>\n",
+       "      <td>22332.439460</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Extended</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-01-11</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>Corporate L3</td>\n",
+       "      <td>Offer2</td>\n",
+       "      <td>Branch</td>\n",
+       "      <td>484.013411</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>QA50777</td>\n",
+       "      <td>Oregon</td>\n",
+       "      <td>9025.067525</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Premium</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>2011-01-17</td>\n",
+       "      <td>Medical Leave</td>\n",
+       "      <td>F</td>\n",
+       "      <td>...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L2</td>\n",
+       "      <td>Offer1</td>\n",
+       "      <td>Branch</td>\n",
+       "      <td>707.925645</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+       "0           0  DK49336     Arizona              4809.216960       No   \n",
+       "1           1  KX64629  California              2228.525238       No   \n",
+       "2           2  LZ68649  Washington             14947.917300       No   \n",
+       "3           3  XL78013      Oregon             22332.439460      Yes   \n",
+       "4           4  QA50777      Oregon              9025.067525       No   \n",
+       "\n",
+       "   coverage education effective_to_date employmentstatus gender  ...  \\\n",
+       "0     Basic   College        2011-02-18         Employed      M  ...   \n",
+       "1     Basic   College        2011-01-18       Unemployed      F  ...   \n",
+       "2     Basic  Bachelor        2011-02-10         Employed      M  ...   \n",
+       "3  Extended   College        2011-01-11         Employed      M  ...   \n",
+       "4   Premium  Bachelor        2011-01-17    Medical Leave      F  ...   \n",
+       "\n",
+       "   number_of_policies     policy_type        policy  renew_offer_type  \\\n",
+       "0                   9  Corporate Auto  Corporate L3            Offer3   \n",
+       "1                   1   Personal Auto   Personal L3            Offer4   \n",
+       "2                   2   Personal Auto   Personal L3            Offer3   \n",
+       "3                   2  Corporate Auto  Corporate L3            Offer2   \n",
+       "4                   7   Personal Auto   Personal L2            Offer1   \n",
+       "\n",
+       "   sales_channel  total_claim_amount  vehicle_class  vehicle_size  \\\n",
+       "0          Agent          292.800000  Four-Door Car       Medsize   \n",
+       "1    Call Center          744.924331  Four-Door Car       Medsize   \n",
+       "2    Call Center          480.000000            SUV       Medsize   \n",
+       "3         Branch          484.013411  Four-Door Car       Medsize   \n",
+       "4         Branch          707.925645  Four-Door Car       Medsize   \n",
+       "\n",
+       "  vehicle_type month  \n",
+       "0            A     2  \n",
+       "1            A     1  \n",
+       "2            A     2  \n",
+       "3            A     1  \n",
+       "4            A     1  \n",
+       "\n",
+       "[5 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "marketing_df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n",
+    "marketing_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "0a843116-b120-4a36-894e-ef640253a8e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10910 entries, 0 to 10909\n",
+      "Data columns (total 27 columns):\n",
+      " #   Column                         Non-Null Count  Dtype  \n",
+      "---  ------                         --------------  -----  \n",
+      " 0   unnamed:_0                     10910 non-null  int64  \n",
+      " 1   customer                       10910 non-null  object \n",
+      " 2   state                          10910 non-null  object \n",
+      " 3   customer_lifetime_value        10910 non-null  float64\n",
+      " 4   response                       10910 non-null  object \n",
+      " 5   coverage                       10910 non-null  object \n",
+      " 6   education                      10910 non-null  object \n",
+      " 7   effective_to_date              10910 non-null  object \n",
+      " 8   employmentstatus               10910 non-null  object \n",
+      " 9   gender                         10910 non-null  object \n",
+      " 10  income                         10910 non-null  int64  \n",
+      " 11  location_code                  10910 non-null  object \n",
+      " 12  marital_status                 10910 non-null  object \n",
+      " 13  monthly_premium_auto           10910 non-null  int64  \n",
+      " 14  months_since_last_claim        10910 non-null  float64\n",
+      " 15  months_since_policy_inception  10910 non-null  int64  \n",
+      " 16  number_of_open_complaints      10910 non-null  float64\n",
+      " 17  number_of_policies             10910 non-null  int64  \n",
+      " 18  policy_type                    10910 non-null  object \n",
+      " 19  policy                         10910 non-null  object \n",
+      " 20  renew_offer_type               10910 non-null  object \n",
+      " 21  sales_channel                  10910 non-null  object \n",
+      " 22  total_claim_amount             10910 non-null  float64\n",
+      " 23  vehicle_class                  10910 non-null  object \n",
+      " 24  vehicle_size                   10910 non-null  object \n",
+      " 25  vehicle_type                   10910 non-null  object \n",
+      " 26  month                          10910 non-null  int64  \n",
+      "dtypes: float64(4), int64(6), object(17)\n",
+      "memory usage: 2.2+ MB\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "marketing_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "1045a6ed-0173-4915-9a0c-8a54aba21ea3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "unnamed:_0                       0\n",
+       "customer                         0\n",
+       "state                            0\n",
+       "customer_lifetime_value          0\n",
+       "response                         0\n",
+       "coverage                         0\n",
+       "education                        0\n",
+       "effective_to_date                0\n",
+       "employmentstatus                 0\n",
+       "gender                           0\n",
+       "income                           0\n",
+       "location_code                    0\n",
+       "marital_status                   0\n",
+       "monthly_premium_auto             0\n",
+       "months_since_last_claim          0\n",
+       "months_since_policy_inception    0\n",
+       "number_of_open_complaints        0\n",
+       "number_of_policies               0\n",
+       "policy_type                      0\n",
+       "policy                           0\n",
+       "renew_offer_type                 0\n",
+       "sales_channel                    0\n",
+       "total_claim_amount               0\n",
+       "vehicle_class                    0\n",
+       "vehicle_size                     0\n",
+       "vehicle_type                     0\n",
+       "month                            0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "marketing_df.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "e6e36267-91fe-495b-9811-063b944af8c1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "marketing_df.duplicated().sum()"
    ]
   },
   {
@@ -93,6 +814,80 @@
     "Round the total revenue to 2 decimal points.  Analyze the resulting table to draw insights."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "a7cf5c17-410b-402d-856d-722e0224f4b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>total_claim_amount</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sales_channel</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Agent</th>\n",
+       "      <td>1810226.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Branch</th>\n",
+       "      <td>1301204.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Call Center</th>\n",
+       "      <td>926600.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Web</th>\n",
+       "      <td>706600.04</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               total_claim_amount\n",
+       "sales_channel                    \n",
+       "Agent                  1810226.82\n",
+       "Branch                 1301204.00\n",
+       "Call Center             926600.82\n",
+       "Web                     706600.04"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pivot_df = marketing_df.pivot_table(index=\"sales_channel\",values=\"total_claim_amount\", aggfunc = \"sum\")\n",
+    "round(pivot_df, 2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "640993b2-a291-436c-a34d-a551144f8196",
@@ -103,6 +898,91 @@
     "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "0303333f-2263-44fc-b7fd-d5c832e99bae",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>education</th>\n",
+       "      <th>Bachelor</th>\n",
+       "      <th>College</th>\n",
+       "      <th>Doctor</th>\n",
+       "      <th>High School or Below</th>\n",
+       "      <th>Master</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gender</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>F</th>\n",
+       "      <td>7874.269478</td>\n",
+       "      <td>7748.823325</td>\n",
+       "      <td>7328.508916</td>\n",
+       "      <td>8675.220201</td>\n",
+       "      <td>8157.053154</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>M</th>\n",
+       "      <td>7703.601675</td>\n",
+       "      <td>8052.459288</td>\n",
+       "      <td>7415.333638</td>\n",
+       "      <td>8149.687783</td>\n",
+       "      <td>8168.832659</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "education     Bachelor      College       Doctor  High School or Below  \\\n",
+       "gender                                                                   \n",
+       "F          7874.269478  7748.823325  7328.508916           8675.220201   \n",
+       "M          7703.601675  8052.459288  7415.333638           8149.687783   \n",
+       "\n",
+       "education       Master  \n",
+       "gender                  \n",
+       "F          8157.053154  \n",
+       "M          8168.832659  "
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "avg_customer_lifetime_value = marketing_df.pivot_table(index=\"gender\",columns=\"education\", values=\"customer_lifetime_value\", aggfunc =\"mean\")\n",
+    "avg_customer_lifetime_value"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -130,14 +1010,60 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 71,
    "id": "3a069e0b-b400-470e-904d-d17582191be4",
    "metadata": {
     "id": "3a069e0b-b400-470e-904d-d17582191be4"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>month</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>number_of_open_complaints</th>\n",
+       "      <td>5.0</td>\n",
+       "      <td>5.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "month                        1    2\n",
+       "number_of_open_complaints  5.0  5.0"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "highest_complaints = marketing_df.pivot_table(columns=\"month\", values=\"number_of_open_complaints\", aggfunc =\"max\")\n",
+    "highest_complaints"
    ]
   }
  ],
@@ -160,7 +1086,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.12.4"
   }
  },
  "nbformat": 4,