You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
- Handling, cleaning, and preparing data.
- Selecting and engineering features.
- Learning by fitting a model to data.
- Optimizing a cost function.
- Selecting a model and tuning hyperparameters using cross-validation.
- Underfitting and overfitting (the bias/variance tradeoff).
- Unsupervised learning techniques: clustering, density estimation and anomaly detection.
- Algorithms: Linear and Polynomial Regression, Logistic Regression, k-Nearest Neighbors, Support Vector Machines, Decision Trees, Random Forests, and Ensemble methods.
# pivot_table() vs groupby(): both lines below compute the same sums of "c"
# for every (a, b) pair. pivot_table() lays them out in wide form (one column
# per value of "b"); groupby() returns a Series indexed by (a, b).
pd.pivot_table(df, index=["a"], columns=["b"], values=["c"], aggfunc=np.sum)
df.groupby(['a', 'b'])['c'].sum()
# Aggregate using one or more operations over the specified axis.
# agg() can be applied to multiple groups together.
df.agg(['sum', 'min'])
# Most frequent value per (Sex, Pclass) group: value_counts() sorts by
# descending count, so index[0] is the mode; then keep the 'Embarked' column.
df_all.groupby(['Sex', 'Pclass']).agg(lambda x: x.value_counts().index[0])['Embarked']
# Apply a function along an axis of the DataFrame.
# apply() cannot be applied to multiple groups together.
df.apply(np.sqrt)
# Deck is the first character of the cabin string, or 'M' when the cabin is missing.
df_all['Deck'] = df_all['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
# Three options for handling the missing "total_bedrooms" values.
# Note: dropna() and drop() return new DataFrames; `housing` itself is unchanged here.
housing.dropna(subset=["total_bedrooms"])  # Option 1: get rid of the corresponding districts
housing.drop("total_bedrooms", axis=1)  # Option 2: get rid of the whole attribute
median = housing["total_bedrooms"].median()  # Option 3: set the values to some value (zero, mean, median)
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call on a column selection is not guaranteed
# to update `housing` (it does not under pandas Copy-on-Write).
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)
'''SimpleImputer, filling with the missing numerical attributes with the "median"'''
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.select_dtypes(include=[np.number])  # just numerical attributes
# "trained" imputer: after fit() it is ready to transform the training set by
# replacing missing values with the learned medians
imputer.fit(housing_num)
imputer.statistics_  # same as "housing_num.median().values"
X = imputer.transform(housing_num)
# wrap the transformed values in a new DataFrame, reusing the original column
# names and row index
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)
'''Save the model'''
import joblib

joblib.dump(my_model, "my_model.pkl")  # to save model
# NOTE: joblib files are pickle-based; only load files from a trusted source.
my_model_loaded = joblib.load("my_model.pkl")  # to load model
Fine-tune Models
Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_  # the best hyperparameters
grid_search.best_estimator_
# look at the score of each hyperparameter combination tested during the grid search:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # scores are negated MSE, so negate before taking the square root to get RMSE
    print(np.sqrt(-mean_score), params)
Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# sample each hyperparameter from a discrete uniform distribution
# (scipy's randint excludes the `high` bound)
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
# looking at the scores during training
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # scores are negated MSE, so negate before taking the square root to get RMSE
    print(np.sqrt(-mean_score), params)
# Read the feature_importances_ attribute of the best estimator found by the grid search.
feature_importances=grid_search.best_estimator_.feature_importances_