Skip to content

Commit 92e3023

Browse files
author
Jacob pc
committed
Merge remote-tracking branch 'origin/main'
2 parents e3cfac7 + 49821c3 commit 92e3023

21 files changed

+1522
-55
lines changed

__pycache__/project1.cpython-39.pyc

3.98 KB
Binary file not shown.
Lines changed: 104 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,58 @@
1+
#Imports
12
import numpy as np
23
import pandas as pd
34
import matplotlib.pyplot as plt
45
from scipy.linalg import svd
56
import seaborn as sns
6-
from sklearn import decomposition
7-
7+
from scipy.stats import boxcox
88

9+
#Loading data
910
filename = 'Weather Training Data.csv'
1011
df = pd.read_csv(filename)
1112

12-
df = df.loc[df['Location'] == 'Canberra']
13+
df = df.loc[df['Location'] == 'Sydney']
1314

1415
df = df[["RainToday", "MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
1516
"Cloud9am", "Temp9am", "Rainfall"]]
1617

1718

1819
print(df)
1920

21+
#Looking at data for missing values
22+
print("Data and its number of missing values.")
2023
print(df.isnull().sum())
24+
2125
# We remove all rows where RainToday is missing (NaN)
2226
df = df.dropna(subset=["RainToday"])
2327

28+
print("Data with removed RainToday data points.")
2429
print(df.isnull().sum())
2530

2631
# We replace the NaN's in every column except the first (RainToday) with that column's mean
2732
for x in list(df.columns.values)[1:]:
2833
df[x] = df[x].fillna(df[x].mean())
2934

35+
print("Data with modified mean values.")
3036
print(df.isnull().sum())
3137

3238
# We encode RainToday as binary: "Yes" -> 1, "No" -> 0
3339
df.loc[df.RainToday == "Yes", "RainToday"] = 1
3440
df.loc[df.RainToday == "No", "RainToday"] = 0
3541

42+
print("Data with binary modified RainToday")
3643
print(df.head())
3744

38-
# We turn the dataset into numpy array
39-
40-
X = df[["MinTemp", "MaxTemp", "WindGustSpeed", "Humidity9am", "Pressure9am",
41-
"Cloud9am", "Temp9am", "Rainfall"]].to_numpy()
42-
print(X.shape)
43-
44-
#for x in ["MinTemp", "MaxTemp", "WindGustSpeed", "Humidity9am", "Pressure9am",
45-
# "Cloud9am", "Temp9am", "Rainfall"]:
46-
# idx = 0
47-
# plt.hist()
48-
49-
50-
51-
# Subtract mean value from data
52-
Y = X - np.ones((2380, 1)) * X.mean(axis=0)
53-
54-
# PCA by computing SVD of Y
55-
U, S, Vh = svd(Y, full_matrices=False)
56-
57-
# Compute variance explained by principal components
58-
rho = (S * S) / (S * S).sum()
59-
60-
threshold90 = 0.9
61-
threshold95 = 0.95
62-
63-
# Plot variance explained
64-
plt.figure()
65-
plt.plot(range(1, len(rho) + 1), rho, 'x-')
66-
plt.plot(range(1, len(rho) + 1), np.cumsum(rho), 'o-')
67-
plt.plot([1, len(rho)], [threshold90, threshold90], 'k--')
68-
plt.plot([1, len(rho)], [threshold95, threshold95], 'r--')
69-
plt.title('Variance explained by principal components');
70-
plt.xlabel('Principal component');
71-
plt.ylabel('Variance explained');
72-
plt.legend(['Individual', 'Cumulative', 'Threshold 90', 'Threshold 95'])
73-
plt.grid()
74-
plt.show()
75-
76-
77-
# We also want to do the correlation between the attributes
78-
sns.displot(df, x="MinTemp", kde=True)
45+
sns.displot(df, x='MinTemp', kde=True)
7946
plt.title("Minimum temperature distribution")
8047
plt.show()
8148

8249
sns.displot(df, x="MaxTemp", kde=True)
83-
plt.title("Maximum temperature distribution", y=1.0, pad=-14)
50+
plt.title("Maximum temperature distribution")
51+
plt.show()
52+
53+
target = np.log(df['MaxTemp'])
54+
sns.displot(data=target, kde=True)
55+
plt.title("Log Transformed Maximum temperature distribution")
8456
plt.show()
8557

8658
sns.displot(df, x="WindGustSpeed", kde=True)
@@ -91,6 +63,11 @@
9163
plt.title("Humidity at 9 am distribution")
9264
plt.show()
9365

66+
target = np.square(df['Humidity9am'])
67+
sns.displot(data=target, kde=True)
68+
plt.title("x-squared Transformed Humidity at 9 am distribution")
69+
plt.show()
70+
9471
sns.displot(df, x="Pressure9am", kde=True)
9572
plt.title("Pressure at 9 am distribution")
9673
plt.show()
@@ -103,31 +80,103 @@
10380
plt.title("Temperature at 9 am distribution")
10481
plt.show()
10582

106-
sns.displot(df, x="Rainfall", kde=True)
107-
plt.title("Rainfall during the day distribution")
108-
plt.show()
83+
# sns.displot(df, x="Rainfall", kde=True)
84+
# plt.title("Rainfall during the day distribution")
85+
# plt.show()
86+
87+
# target = np.log(df['Rainfall'])
88+
# sns.displot(data=target, kde=True)
89+
# plt.title("Log Transformed Rainfall during the day distribution")
90+
# plt.show()
10991

11092
sns.displot(df, x="Evaporation", kde=True)
11193
plt.title("Evaporation distribution")
11294
plt.show()
11395

96+
target = np.sqrt(df['Evaporation'])
97+
sns.displot(data=target, kde=True)
98+
plt.title("Square root Transformed Evaporation distribution")
99+
plt.show()
100+
114101
sns.displot(df, x="Sunshine", kde=True)
115102
plt.title("Sunshine distribution")
116103
plt.show()
117104

105+
#We want to transform the data:
106+
print(df.head())
107+
108+
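#We transform by the following operations (NOTE(review): Humidity9am is square-rooted here, but the exploratory plot earlier in this script used np.square - confirm which transform is intended):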
109+
df_trans = df.copy()
110+
df_trans['Humidity9am'] = df_trans['Humidity9am'].transform(np.sqrt)
111+
df_trans['Evaporation'] = df_trans['Evaporation'].transform(np.sqrt)
112+
df_trans['MaxTemp'] = df_trans['MaxTemp'].transform(np.log)
113+
114+
#And get the following data:
115+
print(df_trans.head())
116+
117+
#PCA
118+
# We turn the dataset into numpy array
119+
X = df_trans[["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
120+
"Cloud9am", "Temp9am"]].to_numpy()
121+
N, M = X.shape
122+
print(f"Shape of data as numpy array: {N,M}")
123+
124+
# Subtract mean value from data
125+
Y = X - np.ones((N, 1)) * X.mean(axis=0)
126+
127+
# PCA by computing SVD of Y
128+
U, S, Vh = svd(Y, full_matrices=False)
129+
V = Vh.T
130+
131+
# Compute variance explained by principal components
132+
rho = (S * S) / (S * S).sum()
133+
#Explained variance
134+
#Cumulative explained-variance thresholds (90% and 95%), drawn as reference lines in the plot
135+
threshold90 = 0.9
136+
threshold95 = 0.95
137+
138+
# Plot variance explained
139+
plt.figure()
140+
plt.plot(range(1, len(rho) + 1), rho, 'x-')
141+
plt.plot(range(1, len(rho) + 1), np.cumsum(rho), 'o-')
142+
plt.plot([1, len(rho)], [threshold90, threshold90], 'k--')
143+
plt.plot([1, len(rho)], [threshold95, threshold95], 'r--')
144+
plt.title('Variance explained by principal components');
145+
plt.xlabel('Principal component');
146+
plt.ylabel('Variance explained');
147+
plt.legend(['Individual', 'Cumulative', 'Threshold 90', 'Threshold 95'])
148+
plt.grid()
149+
plt.show()
118150

119-
#sns.pairplot(df[["MinTemp", "MaxTemp", "WindGustSpeed", "Humidity9am", "Pressure9am","Cloud9am", "Temp9am", "Rainfall"]])
120-
#plt.show()
151+
# Next, we examine the correlation between the attributes
121152

122153
# Compute the correlation matrix of the transformed attributes (Rainfall is not included)
123-
print(df[["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
124-
"Cloud9am", "Temp9am", "Rainfall"]].corr())
154+
corr = df_trans[["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
155+
"Cloud9am", "Temp9am"]].corr()
125156

126-
sns.heatmap(df[["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
127-
"Cloud9am", "Temp9am", "Rainfall"]].corr(), annot=True)
157+
sns.heatmap(corr, annot=True)
128158
plt.xticks(rotation=45)
129159
plt.show()
130160

161+
#Principal directions
162+
pcs = [0,1,2,3]
163+
legendStrs = ['PC'+str(e+1) for e in pcs]
164+
c = ['r','g','b']
165+
attributeNames = ["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed", "Humidity9am", "Pressure9am",
166+
"Cloud9am", "Temp9am"]
167+
bw = .2
168+
r = np.arange(1,M+1)
169+
for i in pcs:
170+
plt.bar(r+i*bw, V[:,i], width=bw)
171+
plt.xticks(r+bw, attributeNames, rotation = 45)
172+
plt.xlabel('Attributes')
173+
plt.ylabel('Component coefficients')
174+
plt.legend(legendStrs)
175+
plt.grid()
176+
plt.title('PCA Component Coefficients')
177+
plt.show()
178+
179+
#PC plots
131180

132181
# scipy.linalg.svd returns "Vh", which is the Hermitian (conjugate) transpose
133182
# of the matrix V. So, for us to obtain the correct V, we transpose:
@@ -154,4 +203,4 @@
154203
location += 1
155204

156205
# Output result to screen
157-
plt.show()
206+
plt.show()

project2.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#Imports
2+
import numpy as np
3+
import pandas as pd
4+
import matplotlib.pyplot as plt
5+
from scipy.linalg import svd
6+
import seaborn as sns
7+
from scipy.stats import boxcox
8+
from project1 import df_trans
9+
from sklearn import preprocessing
10+
11+
#Define target and training variables (without WindGustSpeed as per conclusion of last report)
12+
target_reg = df_trans["Rainfall"]
13+
target_class = df_trans["RainToday"]
14+
var = df_trans.drop(["WindGustSpeed","Rainfall", "RainToday"], axis=1)
15+
16+
#Standardize data
17+
var_scaled = preprocessing.scale(var)

0 commit comments

Comments
 (0)