From da34d93f3c202f8ffac4df01940a2879efaa7801 Mon Sep 17 00:00:00 2001 From: peterjwilliams1 Date: Wed, 15 Jan 2025 16:29:41 +0100 Subject: [PATCH] Updated --- ...structuring-and-combining-checkpoint.ipynb | 1698 +++++++++++++++++ lab-dw-data-structuring-and-combining.ipynb | 1544 ++++++++++++++- 2 files changed, 3235 insertions(+), 7 deletions(-) create mode 100644 .ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..02133a7 --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1698 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. 
The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "ef00922f-b0ec-418c-87c7-3444acae3faa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 customer\n", + "1 st\n", + "2 gender\n", + "3 education\n", + "4 customer lifetime value\n", + "5 income\n", + "6 monthly premium auto\n", + "7 number of open complaints\n", + "8 policy type\n", + "9 vehicle class\n", + "10 total claim amount\n", + "dtype: object" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "import pandas as pd\n", + "url1=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "url2=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "df1 = pd.read_csv(url1)\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)\n", + "\n", + "----------\n", + "\n", + "#1 Lowercase everything:- \n", + "\n", + "df1.columns = pd.Series(df1.columns).apply(lambda col: col.lower())\n", + "df1.columns\n", + "df2.columns = pd.Series(df2.columns).apply(lambda col: 
col.lower())\n", + "df2.columns\n", + "df3.columns = pd.Series(df3.columns).apply(lambda col: col.lower())\n", + "df3.columns\n", + "----------\n", + "\n", + "#2 Change / Replace names of columns:-\n", + "\n", + "df = df.rename(columns={'st': 'State'})\n", + "----------\n", + "\n", + "#3 Remove null values:-\n", + "\n", + " # Checking for Null Values\n", + "df.isnull() # Returns a DataFrame with True where values are null\n", + " # Count the number of null values in each column\n", + "df.isna().sum()\n", + " # Dropping rows with Null Values\n", + "df.dropna(inplace=True)\n", + "----------\n", + "\n", + "# 4 Find Unique Values\n", + "\n", + "unique_genders = df['gender'].unique()\n", + "female_variations = ['Femal', 'Female']\n", + "# Replace the variations with 'F'\n", + "df['gender'] = df['gender'].replace(female_variations, 'F')\n", + "print(df)\n", + " # - Also in comprehension\n", + "\n", + "df['gender'] = df['gender'].apply(lambda x: 'F' if x in ('Femal', 'Female') else ('M' if x == 'Male' else x))\n", + "#-or-\n", + "df['education'] = df['education'].apply(lambda x: 'Bachelor' if x == 'Bachelors' else x)\n", + "#-or-\n", + "df['vehicle_class'] = df['vehicle_class'].str.replace('Sports Car', 'Luxury')\n", + "df['vehicle_class'] = df['vehicle_class'].str.replace('Luxury SUV', 'Luxury')\n", + "df['vehicle_class'] = df['vehicle_class'].str.replace('Luxury Car', 'Luxury')\n", + "\n", + "vc = df['vehicle_class'].unique()\n", + "print(vc)\n", + "----------\n", + "\n", + "#5 Combine Tables\n", + " # Concatenate the sales, revenue, and costs DataFrames vertically (along rows)\n", + "pd.concat([df1, df2, df3], axis=0)\n", + " # Concatenate the sales, revenue, and costs DataFrames horizontally (along columns)\n", + "pd.concat([df1, df2, df3], axis=1) # by default is outer, takes the union\n", + "\n", + "#6 Manually Re-order the Columns:-\n", + " # Specify the desired column order - *** after they have been lower cased and standardized\n", + "desired_order = ['customer', 
'state','gender', 'education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto','Number of Open Complaints', 'Total Claim Amount', 'Policy Type', 'Vehicle Class'] # Replace with your desired column names,\n", + "\n", + " # Check if all desired columns are in the DataFrame\n", + "if all(column in df1.columns for column in desired_order):\n", + " # Reorder the columns\n", + " df1 = df1[desired_order]\n", + " print(\"\\nDataFrame with columns reordered:\")\n", + " print(df1)\n", + "else:\n", + " missing_columns = [col for col in desired_order if col not in df1.columns]\n", + " print(f\"\\nError: The following columns are missing from the DataFrame: {missing_columns}\")\n", + "----------\n", + "\n", + "#7 Convert to Strings\n", + "df1 = df1.applymap(lambda x: x.lower() if isinstance(x, str) else str(x) if x is not None else x)\n", + "---------\n", + "\n", + "# Reset dataframe if needed\n", + "\n", + "df_reset = df.reset_index()\n", + "----------\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [], + "source": [ + "# Your code goes here\n", + "import pandas as pd\n", + "url1=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "url2=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "df1 = pd.read_csv(url1)\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "2e98bc70-e581-45ea-a066-616959d07e29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 162, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "d20839b0-09c5-4cf4-a089-aaadb12fd7d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsTotal Claim AmountPolicy TypeVehicle Class
0GS98873ArizonaFBachelor323912.47%16061881/0/00633.6Personal AutoFour-Door Car
1CW49887CaliforniaFMaster462680.11%794871141/0/00547.2Special AutoSUV
2MY31220CaliforniaFCollege899704.02%542301121/0/00537.6Personal AutoTwo-Door Car
3UH35128OregonFCollege2580706.30%712102141/1/001027.2Personal AutoLuxury Car
4WH52799ArizonaFCollege380812.21%94903941/0/00451.2Corporate AutoTwo-Door Car
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "0 GS98873 Arizona F Bachelor 323912.47% 16061 \n", + "1 CW49887 California F Master 462680.11% 79487 \n", + "2 MY31220 California F College 899704.02% 54230 \n", + "3 UH35128 Oregon F College 2580706.30% 71210 \n", + "4 WH52799 Arizona F College 380812.21% 94903 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Total Claim Amount \\\n", + "0 88 1/0/00 633.6 \n", + "1 114 1/0/00 547.2 \n", + "2 112 1/0/00 537.6 \n", + "3 214 1/1/00 1027.2 \n", + "4 94 1/0/00 451.2 \n", + "\n", + " Policy Type Vehicle Class \n", + "0 Personal Auto Four-Door Car \n", + "1 Special Auto SUV \n", + "2 Personal Auto Two-Door Car \n", + "3 Personal Auto Luxury Car \n", + "4 Corporate Auto Two-Door Car " + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "8fa25e52-668f-4dc4-8df9-2e78b2e15313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'st', 'gender', 'education', 'customer lifetime value',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'policy type', 'vehicle class', 'total claim amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "#1 Lowercase everything:- pd.Series(df1.columns).apply(lambda col: col.lower()) - on separate cells\n", + "#2 Re-order Columns\n", + "# Null Values\n", + "\n", + "df1.columns = pd.Series(df1.columns).apply(lambda col: col.lower())\n", + "df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "cfe7dd19-629d-415b-ac81-48e62e0cd74f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'st', 'gender', 'education', 'customer lifetime
value',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'total claim amount', 'policy type', 'vehicle class'],\n", + " dtype='object')" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "#1 Lowercase everything:- df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)\n", + "#2 Re-order Columns\n", + "# Null Values\n", + "\n", + "df2.columns = pd.Series(df2.columns).apply(lambda col: col.lower())\n", + "df2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "6bcd3d11-8290-4490-b1c7-a1dcf855ff74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'customer lifetime value', 'education', 'gender',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'policy type', 'total claim amount', 'vehicle class'],\n", + " dtype='object')" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns = pd.Series(df3.columns).apply(lambda col: col.lower())\n", + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "edae10e3-1c37-4c4a-8d45-ef818019e534", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = df3.rename(columns={'state': 'st'})" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "500ffc1f-9752-45c2-8a85-3760d22acb87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstcustomer lifetime valueeducationgenderincomemonthly premium autonumber of open complaintspolicy typetotal claim amountvehicle class
0SA25987Washington3479.137523High School or BelowM01040Personal Auto499.200000Two-Door Car
1TB86706Arizona2502.637401MasterM0660Personal Auto3.468912Two-Door Car
\n", + "
" + ], + "text/plain": [ + " customer st customer lifetime value education gender \\\n", + "0 SA25987 Washington 3479.137523 High School or Below M \n", + "1 TB86706 Arizona 2502.637401 Master M \n", + "\n", + " income monthly premium auto number of open complaints policy type \\\n", + "0 0 104 0 Personal Auto \n", + "1 0 66 0 Personal Auto \n", + "\n", + " total claim amount vehicle class \n", + "0 499.200000 Two-Door Car \n", + "1 3.468912 Two-Door Car " + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1e26414-f807-4bc2-8b38-7652d1ddd9bd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "9e1fca76-fce2-4b86-9387-161b49ae9960", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerStateGenderCustomer Lifetime ValueEducationIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeTotal Claim AmountVehicle Class
0SA25987WashingtonM3479.137523High School or Below01040Personal Auto499.2Two-Door Car
\n", + "
" + ], + "text/plain": [ + " Customer State Gender Customer Lifetime Value Education \\\n", + "0 SA25987 Washington M 3479.137523 High School or Below \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0 104 0 Personal Auto \n", + "\n", + " Total Claim Amount Vehicle Class \n", + "0 499.2 Two-Door Car " + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "d0d438fb-4852-4676-b4de-1beabf9b8d31", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DataFrame with columns reordered:\n", + " Customer ST GENDER Education \\\n", + "0 rb50392 washington nan master \n", + "1 qz44356 arizona f bachelor \n", + "2 ai49188 nevada f bachelor \n", + "3 ww63253 california m bachelor \n", + "4 ga49547 washington m high school or below \n", + "... ... ... ... ... \n", + "4003 nan nan nan nan \n", + "4004 nan nan nan nan \n", + "4005 nan nan nan nan \n", + "4006 nan nan nan nan \n", + "4007 nan nan nan nan \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 nan 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 nan nan nan \n", + "4004 nan nan nan \n", + "4005 nan nan nan \n", + "4006 nan nan nan \n", + "4007 nan nan nan \n", + "\n", + " Number of Open Complaints Total Claim Amount Policy Type \\\n", + "0 1/0/00 2.704934 personal auto \n", + "1 1/0/00 1131.464935 personal auto \n", + "2 1/0/00 566.472247 personal auto \n", + "3 1/0/00 529.881344 corporate auto \n", + "4 1/0/00 17.269323 personal auto \n", + "... ... ... ... 
\n", + "4003 nan nan nan \n", + "4004 nan nan nan \n", + "4005 nan nan nan \n", + "4006 nan nan nan \n", + "4007 nan nan nan \n", + "\n", + " Vehicle Class \n", + "0 four-door car \n", + "1 four-door car \n", + "2 two-door car \n", + "3 suv \n", + "4 four-door car \n", + "... ... \n", + "4003 nan \n", + "4004 nan \n", + "4005 nan \n", + "4006 nan \n", + "4007 nan \n", + "\n", + "[4008 rows x 11 columns]\n" + ] + } + ], + "source": [ + "# Specify the desired column order\n", + "desired_order = ['Customer', 'ST','GENDER', 'Education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto','Number of Open Complaints', 'Total Claim Amount', 'Policy Type', 'Vehicle Class'] # Replace with your desired column names,\n", + "\n", + "# Check if all desired columns are in the DataFrame\n", + "if all(column in df1.columns for column in desired_order):\n", + " # Reorder the columns\n", + " df1 = df1[desired_order]\n", + " print(\"\\nDataFrame with columns reordered:\")\n", + " print(df1)\n", + "else:\n", + " missing_columns = [col for col in desired_order if col not in df1.columns]\n", + " print(f\"\\nError: The following columns are missing from the DataFrame: {missing_columns}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "54274993-7e44-4976-aa52-3b41a8831232", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstgendereducationcustomer lifetime valueincomemonthly premium autonumber of open complaintspolicy typevehicle classtotal claim amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
7065LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
7066PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
7067TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
7068UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
7069Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer st gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California M Bachelor \n", + "7066 PK87824 California F College \n", + "7067 TD14365 California M Bachelor \n", + "7068 UP19263 California M College \n", + "7069 Y167826 California M College \n", + "\n", + " customer lifetime value income monthly premium auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " number of open complaints policy type vehicle class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " total claim amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "7065 198.234764 \n", + "7066 379.200000 \n", + "7067 790.784983 \n", + "7068 691.200000 \n", + "7069 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2, df3], axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis_clean.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [], + "source": [ + "# Your code goes here\n", + "url4=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "df4 = pd.read_csv(url4)" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "c1d5fe1e-5478-4da4-906a-87976faf4674", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 DK49336 Arizona 4809.216960 No Basic \n", + "1 1 KX64629 California 2228.525238 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 College 2011-02-18 Employed M ... \n", + "1 College 2011-01-18 Unemployed F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 191, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "0dac4374-5c3f-41c5-bcd0-da29057d314c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n", + " 'response', 'coverage', 'education', 'effective_to_date',\n", + " 'employmentstatus', 'gender', 'income', 'location_code',\n", + " 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n", + " 'months_since_policy_inception', 'number_of_open_complaints',\n", + " 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n", + " 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n", + " 'vehicle_type', 'month'],\n", + " dtype='object')" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.columns = pd.Series(df4.columns).apply(lambda col: col.lower())\n", + "df4.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "0978f14b-1ad3-406b-b1a5-a793b900fa90", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 DK49336 Arizona 4809.216960 No Basic \n", + "1 1 KX64629 California 2228.525238 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 College 2011-02-18 Employed M ... \n", + "1 College 2011-01-18 Unemployed F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "039fdbf3-a81f-44e4-b06c-5013ac4c4abe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
income
sales_channel
Agent152490152
Branch113775608
Call Center81055004
Web62200103
\n", + "
" + ], + "text/plain": [ + " income\n", + "sales_channel \n", + "Agent 152490152\n", + "Branch 113775608\n", + "Call Center 81055004\n", + "Web 62200103" + ] + }, + "execution_count": 213, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df4 = df4.pivot_table(index='sales_channel',values=['income'], aggfunc='sum')\n", + "pivot_df4" + ] + }, + { + "cell_type": "markdown", + "id": "4ad12823-4f6d-4d42-a40c-97b72d3fd5d3", + "metadata": {}, + "source": [ + "Agent is the strongest sales channel in the table above (about 152.5M in total), followed by Branch (113.8M), Call Center (81.1M) and Web (62.2M), so the in-person channels lead. Note that the pivot sums the `income` column, so these totals measure `income` per channel rather than `total_claim_amount`." + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "id": "6c8abea1-107e-4e66-9331-f0ed90e2c5f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
genderFM
education
Bachelor13386258.1112110061.83
College12460107.9112851725.02
Doctor1465701.781490482.06
High School or Below13793600.1212680914.19
Master3874600.253365559.06
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value \n", + "gender F M\n", + "education \n", + "Bachelor 13386258.11 12110061.83\n", + "College 12460107.91 12851725.02\n", + "Doctor 1465701.78 1490482.06\n", + "High School or Below 13793600.12 12680914.19\n", + "Master 3874600.25 3365559.06" + ] + }, + "execution_count": 234, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df4a = df4.pivot_table(index='education',columns='gender', values=['customer_lifetime_value'], aggfunc='sum')\n", + "pivot_df4a = pivot_df4a.round(2)\n", + "pivot_df4a" + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [], + "source": [ + "# Your code goes here" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..02133a7 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,1060 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 123, + "id": "ef00922f-b0ec-418c-87c7-3444acae3faa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 customer\n", + "1 st\n", + "2 gender\n", + "3 education\n", + "4 customer lifetime value\n", + "5 income\n", + "6 monthly premium auto\n", + "7 number of open complaints\n", + "8 policy type\n", + "9 vehicle class\n", 
+ "10 total claim amount\n", + "dtype: object" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "import pandas as pd\n", + "url1=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "url2=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "df1 = pd.read_csv(url1)\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)\n", + "\n", + "----------\n", + "\n", + "#1 Lowercase everything:- \n", + "\n", + "df1.columns = pd.Series(df1.columns).apply(lambda col: col.lower())\n", + "df1.columns\n", + "df2.columns = pd.Series(df2.columns).apply(lambda col: col.lower())\n", + "df2.columns\n", + "df3.columns = pd.Series(df3.columns).apply(lambda col: col.lower())\n", + "df3.columns\n", + "----------\n", + "\n", + "#2 Change / Replace names of columns:-\n", + "\n", + "df = df.rename(columns={'st': 'State'})\n", + "----------\n", + "\n", + "#3 Remove null values:-\n", + "\n", + " # Checking for Null Values\n", + "df.isnull() # Returns a DataFrame with True where values are null\n", + " # Count the number of null values in each column\n", + "df.isna().sum()\n", + " # Dropping rows with Null Values\n", + "df.dropna(inplace=True)\n", + "----------\n", + "\n", + "# 4 Find Unique Values\n", + "\n", + "unique_genders = df['gender'].unique()\n", + "female_variations = ['Femal', 'Female']\n", + "# Replace the variations with 'F'\n", + "df['gender'] = df['gender'].replace(female_variations, 'F')\n", + "print(df)\n", + " # - Also in comprehension\n", + "\n", + "df['gender'] = df['gender'].apply(lambda x: 'F' if x in ('Femal', 'Female') else ('M' if x == 'Male' else x))\n", + "#-or-\n", + "df['education'] = df['education'].apply(lambda x: 'Bachelor' if x == 'Bachelors' else x)\n", + "#-or-\n", + "df['vehicle_class'] = 
df['vehicle_class'].str.replace('Sports Car', 'Luxury')\n", + "df['vehicle_class'] = df['vehicle_class'].str.replace('Luxury SUV', 'Luxury')\n", + "df['vehicle_class'] = df['vehicle_class'].str.replace('Luxury Car', 'Luxury')\n", + "\n", + "vc = df['vehicle_class'].unique()\n", + "print(vc)\n", + "----------\n", + "\n", + "#5 Combine Tables\n", + " # Concatenate the sales, revenue, and costs DataFrames vertically (along rows)\n", + "pd.concat([df1, df2, df3], axis=0)\n", + " # Concatenate the sales, revenue, and costs DataFrames horizontally (along columns)\n", + "pd.concat([df1, df2, df3], axis=1) # by default is outer, takes the union\n", + "\n", + "#6 Manually Re-order the Columns:-\n", + " # Specify the desired column order - *** after they have been lower cased and standardized\n", + "desired_order = ['customer', 'state','gender', 'education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto','Number of Open Complaints', 'Total Claim Amount', 'Policy Type', 'Vehicle Class'] # Replace with your desired column names,\n", + "\n", + " # Check if all desired columns are in the DataFrame\n", + "if all(column in df1.columns for column in desired_order):\n", + " # Reorder the columns\n", + " df1 = df1[desired_order]\n", + " print(\"\\nDataFrame with columns reordered:\")\n", + " print(df1)\n", + "else:\n", + " missing_columns = [col for col in desired_order if col not in df1.columns]\n", + " print(f\"\\nError: The following columns are missing from the DataFrame: {missing_columns}\")\n", + "----------\n", + "\n", + "#7 Convert to Strings\n", + "df1 = df1.applymap(lambda x: x.lower() if isinstance(x, str) else str(x) if x is not None else x)\n", + "---------\n", + "\n", + "# Reset dataframe if needed\n", + "\n", + "df_reset = df.reset_index()\n", + "----------\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 161, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, "outputs": 
[], "source": [ - "# Your code goes here" + "# Your code goes here\n", + "import pandas as pd\n", + "url1=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "url2=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "df1 = pd.read_csv(url1)\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "2e98bc70-e581-45ea-a066-616959d07e29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 162, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "d20839b0-09c5-4cf4-a089-aaadb12fd7d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsTotal Claim AmountPolicy TypeVehicle Class
0GS98873ArizonaFBachelor323912.47%16061881/0/00633.6Personal AutoFour-Door Car
1CW49887CaliforniaFMaster462680.11%794871141/0/00547.2Special AutoSUV
2MY31220CaliforniaFCollege899704.02%542301121/0/00537.6Personal AutoTwo-Door Car
3UH35128OregonFCollege2580706.30%712102141/1/001027.2Personal AutoLuxury Car
4WH52799ArizonaFCollege380812.21%94903941/0/00451.2Corporate AutoTwo-Door Car
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "0 GS98873 Arizona F Bachelor 323912.47% 16061 \n", + "1 CW49887 California F Master 462680.11% 79487 \n", + "2 MY31220 California F College 899704.02% 54230 \n", + "3 UH35128 Oregon F College 2580706.30% 71210 \n", + "4 WH52799 Arizona F College 380812.21% 94903 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Total Claim Amount \\\n", + "0 88 1/0/00 633.6 \n", + "1 114 1/0/00 547.2 \n", + "2 112 1/0/00 537.6 \n", + "3 214 1/1/00 1027.2 \n", + "4 94 1/0/00 451.2 \n", + "\n", + " Policy Type Vehicle Class \n", + "0 Personal Auto Four-Door Car \n", + "1 Special Auto SUV \n", + "2 Personal Auto Two-Door Car \n", + "3 Personal Auto Luxury Car \n", + "4 Corporate Auto Two-Door Car " + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "8fa25e52-668f-4dc4-8df9-2e78b2e15313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'st', 'gender', 'education', 'customer lifetime value',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'policy type', 'vehicle class', 'total claim amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "#1 Lowercase everything:- pd.Series(df1.columns).apply(lambda col: col.lower()) - on seaparate cells\n", + "#2 Re-order Columns\n", + "# Null Values\n", + "\n", + "df1.columns = pd.Series(df1.columns).apply(lambda col: col.lower())\n", + "df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "cfe7dd19-629d-415b-ac81-48e62e0cd74f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'st', 'gender', 'education', 'customer lifetime 
value',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'total claim amount', 'policy type', 'vehicle class'],\n", + " dtype='object')" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- DATA CLEANING STEPS ---\n", + "#1 Lowercase the df2 column names (to lowercase the cell values instead: df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x))\n", + "#2 Re-order Columns\n", + "# Null Values\n", + "\n", + "df2.columns = pd.Series(df2.columns).apply(lambda col: col.lower())\n", + "df2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "6bcd3d11-8290-4490-b1c7-a1dcf855ff74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'customer lifetime value', 'education', 'gender',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'policy type', 'total claim amount', 'vehicle class'],\n", + " dtype='object')" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns = pd.Series(df3.columns).apply(lambda col: col.lower())\n", + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "edae10e3-1c37-4c4a-8d45-ef818019e534", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = df3.rename(columns={'state': 'st'})" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "500ffc1f-9752-45c2-8a85-3760d22acb87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstcustomer lifetime valueeducationgenderincomemonthly premium autonumber of open complaintspolicy typetotal claim amountvehicle class
0SA25987Washington3479.137523High School or BelowM01040Personal Auto499.200000Two-Door Car
1TB86706Arizona2502.637401MasterM0660Personal Auto3.468912Two-Door Car
\n", + "
" + ], + "text/plain": [ + " customer st customer lifetime value education gender \\\n", + "0 SA25987 Washington 3479.137523 High School or Below M \n", + "1 TB86706 Arizona 2502.637401 Master M \n", + "\n", + " income monthly premium auto number of open complaints policy type \\\n", + "0 0 104 0 Personal Auto \n", + "1 0 66 0 Personal Auto \n", + "\n", + " total claim amount vehicle class \n", + "0 499.200000 Two-Door Car \n", + "1 3.468912 Two-Door Car " + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1e26414-f807-4bc2-8b38-7652d1ddd9bd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "9e1fca76-fce2-4b86-9387-161b49ae9960", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerStateGenderCustomer Lifetime ValueEducationIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeTotal Claim AmountVehicle Class
0SA25987WashingtonM3479.137523High School or Below01040Personal Auto499.2Two-Door Car
\n", + "
" + ], + "text/plain": [ + " Customer State Gender Customer Lifetime Value Education \\\n", + "0 SA25987 Washington M 3479.137523 High School or Below \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0 104 0 Personal Auto \n", + "\n", + " Total Claim Amount Vehicle Class \n", + "0 499.2 Two-Door Car " + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "d0d438fb-4852-4676-b4de-1beabf9b8d31", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DataFrame with columns reordered:\n", + " Customer ST GENDER Education \\\n", + "0 rb50392 washington nan master \n", + "1 qz44356 arizona f bachelor \n", + "2 ai49188 nevada f bachelor \n", + "3 ww63253 california m bachelor \n", + "4 ga49547 washington m high school or below \n", + "... ... ... ... ... \n", + "4003 nan nan nan nan \n", + "4004 nan nan nan nan \n", + "4005 nan nan nan nan \n", + "4006 nan nan nan nan \n", + "4007 nan nan nan nan \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 nan 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 nan nan nan \n", + "4004 nan nan nan \n", + "4005 nan nan nan \n", + "4006 nan nan nan \n", + "4007 nan nan nan \n", + "\n", + " Number of Open Complaints Total Claim Amount Policy Type \\\n", + "0 1/0/00 2.704934 personal auto \n", + "1 1/0/00 1131.464935 personal auto \n", + "2 1/0/00 566.472247 personal auto \n", + "3 1/0/00 529.881344 corporate auto \n", + "4 1/0/00 17.269323 personal auto \n", + "... ... ... ... 
\n", + "4003 nan nan nan \n", + "4004 nan nan nan \n", + "4005 nan nan nan \n", + "4006 nan nan nan \n", + "4007 nan nan nan \n", + "\n", + " Vehicle Class \n", + "0 four-door car \n", + "1 four-door car \n", + "2 two-door car \n", + "3 suv \n", + "4 four-door car \n", + "... ... \n", + "4003 nan \n", + "4004 nan \n", + "4005 nan \n", + "4006 nan \n", + "4007 nan \n", + "\n", + "[4008 rows x 11 columns]\n" + ] + } + ], + "source": [ + "# Specify the desired column order\n", + "desired_order = ['Customer', 'ST','GENDER', 'Education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto','Number of Open Complaints', 'Total Claim Amount', 'Policy Type', 'Vehicle Class'] # Replace with your desired column names,\n", + "\n", + "# Check if all desired columns are in the DataFrame\n", + "if all(column in df1.columns for column in desired_order):\n", + " # Reorder the columns\n", + " df1 = df1[desired_order]\n", + " print(\"\\nDataFrame with columns reordered:\")\n", + " print(df1)\n", + "else:\n", + " missing_columns = [col for col in desired_order if col not in df1.columns]\n", + " print(f\"\\nError: The following columns are missing from the DataFrame: {missing_columns}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "54274993-7e44-4976-aa52-3b41a8831232", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstgendereducationcustomer lifetime valueincomemonthly premium autonumber of open complaintspolicy typevehicle classtotal claim amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
7065LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
7066PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
7067TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
7068UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
7069Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer st gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California M Bachelor \n", + "7066 PK87824 California F College \n", + "7067 TD14365 California M Bachelor \n", + "7068 UP19263 California M College \n", + "7069 Y167826 California M College \n", + "\n", + " customer lifetime value income monthly premium auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " number of open complaints policy type vehicle class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " total claim amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "7065 198.234764 \n", + "7066 379.200000 \n", + "7067 790.784983 \n", + "7068 691.200000 \n", + "7069 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2, df3], axis=0)" ] }, { @@ -72,14 +1118,498 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 189, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, "outputs": [], "source": [ - "# Your code goes here" + "# Your code goes here\n", + "url4=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "df4 = pd.read_csv(url4)" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "c1d5fe1e-5478-4da4-906a-87976faf4674", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 DK49336 Arizona 4809.216960 No Basic \n", + "1 1 KX64629 California 2228.525238 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 College 2011-02-18 Employed M ... \n", + "1 College 2011-01-18 Unemployed F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 191, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "0dac4374-5c3f-41c5-bcd0-da29057d314c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n", + " 'response', 'coverage', 'education', 'effective_to_date',\n", + " 'employmentstatus', 'gender', 'income', 'location_code',\n", + " 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n", + " 'months_since_policy_inception', 'number_of_open_complaints',\n", + " 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n", + " 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n", + " 'vehicle_type', 'month'],\n", + " dtype='object')" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.columns = pd.Series(df4.columns).apply(lambda col: col.lower())\n", + "df4.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "0978f14b-1ad3-406b-b1a5-a793b900fa90", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 DK49336 Arizona 4809.216960 No Basic \n", + "1 1 KX64629 California 2228.525238 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 College 2011-02-18 Employed M ... \n", + "1 College 2011-01-18 Unemployed F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "039fdbf3-a81f-44e4-b06c-5013ac4c4abe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
income
sales_channel
Agent152490152
Branch113775608
Call Center81055004
Web62200103
\n", + "
" + ], + "text/plain": [ + " income\n", + "sales_channel \n", + "Agent 152490152\n", + "Branch 113775608\n", + "Call Center 81055004\n", + "Web 62200103" + ] + }, + "execution_count": 213, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df4 = df4.pivot_table(index='sales_channel',values=['income'], aggfunc='sum')\n", + "pivot_df4" + ] + }, + { + "cell_type": "markdown", + "id": "4ad12823-4f6d-4d42-a40c-97b72d3fd5d3", + "metadata": {}, + "source": [ + "The in-person sales method is the most lucrative." + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "id": "6c8abea1-107e-4e66-9331-f0ed90e2c5f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
genderFM
education
Bachelor13386258.1112110061.83
College12460107.9112851725.02
Doctor1465701.781490482.06
High School or Below13793600.1212680914.19
Master3874600.253365559.06
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value \n", + "gender F M\n", + "education \n", + "Bachelor 13386258.11 12110061.83\n", + "College 12460107.91 12851725.02\n", + "Doctor 1465701.78 1490482.06\n", + "High School or Below 13793600.12 12680914.19\n", + "Master 3874600.25 3365559.06" + ] + }, + "execution_count": 234, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df4a = df4.pivot_table(index='education',columns='gender', values=['customer_lifetime_value'], aggfunc='sum')\n", + "pivot_df4a = pivot_df4a.round(2)\n", + "pivot_df4a" ] }, { @@ -146,9 +1676,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -160,7 +1690,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,