1
+ #Imports
1
2
import numpy as np
2
3
import pandas as pd
3
4
import matplotlib .pyplot as plt
4
5
from scipy .linalg import svd
5
6
import seaborn as sns
6
- from sklearn import decomposition
7
-
7
+ from scipy .stats import boxcox
8
8
9
+ #Loading data
9
10
filename = 'Weather Training Data.csv'
10
11
df = pd .read_csv (filename )
11
12
12
- df = df .loc [df ['Location' ] == 'Canberra ' ]
13
+ df = df .loc [df ['Location' ] == 'Sydney ' ]
13
14
14
15
df = df [["RainToday" , "MinTemp" , "MaxTemp" , "Evaporation" , "Sunshine" , "WindGustSpeed" , "Humidity9am" , "Pressure9am" ,
15
16
"Cloud9am" , "Temp9am" , "Rainfall" ]]
16
17
17
18
18
19
print (df )
19
20
21
+ #Looking at data for missing values
22
+ print ("Data and its number of missing values." )
20
23
print (df .isnull ().sum ())
24
+
21
25
# We remove all rows where RainToday is missing (NaN)
22
26
df = df .dropna (subset = ["RainToday" ])
23
27
28
+ print ("Data with removed RainToday data points." )
24
29
print (df .isnull ().sum ())
25
30
26
31
# Impute missing values: in every column except the first one, replace
# each NaN with the mean of that column.
for column in df.columns[1:]:
    df[column] = df[column].fillna(df[column].mean())
29
34
35
+ print ("Data with modified mean values." )
30
36
print (df .isnull ().sum ())
31
37
32
38
# Encode the RainToday labels as integers: "Yes" -> 1, "No" -> 0.
for label, code in (("Yes", 1), ("No", 0)):
    df.loc[df["RainToday"] == label, "RainToday"] = code
35
41
42
+ print ("Data with binary modified RainToday" )
36
43
print (df .head ())
37
44
38
- # We turn the dataset into numpy array
39
-
40
- X = df [["MinTemp" , "MaxTemp" , "WindGustSpeed" , "Humidity9am" , "Pressure9am" ,
41
- "Cloud9am" , "Temp9am" , "Rainfall" ]].to_numpy ()
42
- print (X .shape )
43
-
44
- #for x in ["MinTemp", "MaxTemp", "WindGustSpeed", "Humidity9am", "Pressure9am",
45
- # "Cloud9am", "Temp9am", "Rainfall"]:
46
- # idx = 0
47
- # plt.hist()
48
-
49
-
50
-
51
- # Subtract mean value from data
52
- Y = X - np .ones ((2380 , 1 )) * X .mean (axis = 0 )
53
-
54
- # PCA by computing SVD of Y
55
- U , S , Vh = svd (Y , full_matrices = False )
56
-
57
- # Compute variance explained by principal components
58
- rho = (S * S ) / (S * S ).sum ()
59
-
60
- threshold90 = 0.9
61
- threshold95 = 0.95
62
-
63
- # Plot variance explained
64
- plt .figure ()
65
- plt .plot (range (1 , len (rho ) + 1 ), rho , 'x-' )
66
- plt .plot (range (1 , len (rho ) + 1 ), np .cumsum (rho ), 'o-' )
67
- plt .plot ([1 , len (rho )], [threshold90 , threshold90 ], 'k--' )
68
- plt .plot ([1 , len (rho )], [threshold95 , threshold95 ], 'r--' )
69
- plt .title ('Variance explained by principal components' );
70
- plt .xlabel ('Principal component' );
71
- plt .ylabel ('Variance explained' );
72
- plt .legend (['Individual' , 'Cumulative' , 'Threshold 90' , 'Threshold 95' ])
73
- plt .grid ()
74
- plt .show ()
75
-
76
-
77
- # We also want to do the correlation between the attributes
78
- sns .displot (df , x = "MinTemp" , kde = True )
45
+ sns .displot (df , x = 'MinTemp' , kde = True )
79
46
plt .title ("Minimum temperature distribution" )
80
47
plt .show ()
81
48
82
49
sns .displot (df , x = "MaxTemp" , kde = True )
83
- plt .title ("Maximum temperature distribution" , y = 1.0 , pad = - 14 )
50
+ plt .title ("Maximum temperature distribution" )
51
+ plt .show ()
52
+
53
+ target = np .log (df ['MaxTemp' ])
54
+ sns .displot (data = target , kde = True )
55
+ plt .title ("Log Transformed Maximum temperature distribution" )
84
56
plt .show ()
85
57
86
58
sns .displot (df , x = "WindGustSpeed" , kde = True )
91
63
plt .title ("Humidity at 9 am distribution" )
92
64
plt .show ()
93
65
66
+ target = np .square (df ['Humidity9am' ])
67
+ sns .displot (data = target , kde = True )
68
+ plt .title ("x-squared Transformed Humidity at 9 am distribution" )
69
+ plt .show ()
70
+
94
71
sns .displot (df , x = "Pressure9am" , kde = True )
95
72
plt .title ("Pressure at 9 am distribution" )
96
73
plt .show ()
103
80
plt .title ("Temperature at 9 am distribution" )
104
81
plt .show ()
105
82
106
- sns .displot (df , x = "Rainfall" , kde = True )
107
- plt .title ("Rainfall during the day distribution" )
108
- plt .show ()
83
+ # sns.displot(df, x="Rainfall", kde=True)
84
+ # plt.title("Rainfall during the day distribution")
85
+ # plt.show()
86
+
87
+ # target = np.log(df['Rainfall'])
88
+ # sns.displot(data=target, kde=True)
89
+ # plt.title("Log Transformed Rainfall during the day distribution")
90
+ # plt.show()
109
91
110
92
sns .displot (df , x = "Evaporation" , kde = True )
111
93
plt .title ("Evaporation distribution" )
112
94
plt .show ()
113
95
96
+ target = np .sqrt (df ['Evaporation' ])
97
+ sns .displot (data = target , kde = True )
98
+ plt .title ("Square root Transformed Evaporation distribution" )
99
+ plt .show ()
100
+
114
101
sns .displot (df , x = "Sunshine" , kde = True )
115
102
plt .title ("Sunshine distribution" )
116
103
plt .show ()
117
104
105
+ #We want to transform the data:
106
+ print (df .head ())
107
+
108
# Apply the transformations examined in the distribution plots to a copy,
# so the raw values in df stay available.
df_trans = df.copy()
# Humidity9am is squared: this is the transformation shown in the
# "x-squared Transformed Humidity at 9 am distribution" plot.
df_trans['Humidity9am'] = df_trans['Humidity9am'].transform(np.square)
# Evaporation gets the square root and MaxTemp the natural logarithm,
# matching their respective transformed-distribution plots.
df_trans['Evaporation'] = df_trans['Evaporation'].transform(np.sqrt)
df_trans['MaxTemp'] = df_trans['MaxTemp'].transform(np.log)
113
+
114
+ #And get the following data:
115
+ print (df_trans .head ())
116
+
117
+ #PCA
118
+ # We turn the dataset into numpy array
119
+ X = df_trans [["MinTemp" , "MaxTemp" , "Evaporation" , "Sunshine" , "WindGustSpeed" , "Humidity9am" , "Pressure9am" ,
120
+ "Cloud9am" , "Temp9am" ]].to_numpy ()
121
+ N , M = X .shape
122
+ print (f"Shape of data as numpy array: { N ,M } " )
123
+
124
# Center the data: subtract each attribute's mean from its column
# (the row of column means is broadcast over all rows of X).
Y = X - X.mean(axis=0)

# PCA via the singular value decomposition of the centered data.
U, S, Vh = svd(Y, full_matrices=False)
# svd returns Vh; transpose it to obtain V.
V = Vh.T

# Fraction of the total variance explained by each principal component.
squared_singular_values = S ** 2
rho = squared_singular_values / squared_singular_values.sum()
133
# Explained variance: reference levels drawn as horizontal lines.
threshold90 = 0.9
threshold95 = 0.95

# Plot the individual and cumulative variance explained per component.
component_numbers = range(1, len(rho) + 1)
x_span = [1, len(rho)]

plt.figure()
plt.plot(component_numbers, rho, 'x-')
plt.plot(component_numbers, np.cumsum(rho), 'o-')
plt.plot(x_span, [threshold90, threshold90], 'k--')
plt.plot(x_span, [threshold95, threshold95], 'r--')
plt.title('Variance explained by principal components')
plt.xlabel('Principal component')
plt.ylabel('Variance explained')
plt.legend(['Individual', 'Cumulative', 'Threshold 90', 'Threshold 95'])
plt.grid()
plt.show()
118
150
119
- #sns.pairplot(df[["MinTemp", "MaxTemp", "WindGustSpeed", "Humidity9am", "Pressure9am","Cloud9am", "Temp9am", "Rainfall"]])
120
- #plt.show()
151
+ # We also want to do the correlation between the attributes
121
152
122
153
# We want to find the correlation
123
- print ( df [["MinTemp" , "MaxTemp" , "Evaporation" , "Sunshine" , "WindGustSpeed" , "Humidity9am" , "Pressure9am" ,
124
- "Cloud9am" , "Temp9am" , "Rainfall" ]].corr () )
154
# Pairwise correlation between the transformed attributes.
corr_columns = [
    "MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed",
    "Humidity9am", "Pressure9am", "Cloud9am", "Temp9am",
]
corr = df_trans[corr_columns].corr()
125
156
126
- sns .heatmap (df [["MinTemp" , "MaxTemp" , "Evaporation" , "Sunshine" , "WindGustSpeed" , "Humidity9am" , "Pressure9am" ,
127
- "Cloud9am" , "Temp9am" , "Rainfall" ]].corr (), annot = True )
157
+ sns .heatmap (corr , annot = True )
128
158
plt .xticks (rotation = 45 )
129
159
plt .show ()
130
160
161
# Principal directions: grouped bar chart showing, for every attribute,
# its coefficient in each of the selected principal components.
pcs = [0, 1, 2, 3]
legendStrs = ['PC' + str(e + 1) for e in pcs]
c = ['r', 'g', 'b']  # NOTE(review): not used in this section; kept in case later code reads it
attributeNames = ["MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed",
                  "Humidity9am", "Pressure9am", "Cloud9am", "Temp9am"]
bw = .2
r = np.arange(1, M + 1)
for i in pcs:
    # Shift each component's bars by one bar width so they sit side by side.
    plt.bar(r + i * bw, V[:, i], width=bw)
# Put each tick label under the middle of its group of len(pcs) bars; the
# bars of a group start at offsets 0, bw, ..., (len(pcs) - 1) * bw.
plt.xticks(r + bw * (len(pcs) - 1) / 2, attributeNames, rotation=45)
plt.xlabel('Attributes')
plt.ylabel('Component coefficients')
plt.legend(legendStrs)
plt.grid()
plt.title('PCA Component Coefficients')
plt.show()
178
+
179
+ #PC plots
131
180
132
181
# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
133
182
# of the matrix V. So, for us to obtain the correct V, we transpose:
154
203
location += 1
155
204
156
205
# Output result to screen
157
- plt .show ()
206
+ plt .show ()
0 commit comments