Loading each CSV file separately, then concatenating the pandas DataFrames.
import os
import pandas as pd
path1 = 'archive/student-mat.csv'
path2 = 'archive/student-por.csv'
def load_data(path1=path1, path2=path2):
    dataframes = [pd.read_csv(path1), pd.read_csv(path2)]
    return pd.concat(dataframes)
students = load_data()
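A quick sanity check (a minimal sketch, not part of the original notebook) confirms that both files share the same column layout before they are concatenated:
# Sanity check: both source files should share the same 33 columns,
# otherwise pd.concat would silently introduce NaN-filled columns.
mat, por = pd.read_csv(path1), pd.read_csv(path2)
assert list(mat.columns) == list(por.columns), 'column layouts differ'
print(mat.shape, por.shape, students.shape)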
Looking at histograms, the format of the attributes, among other things. Added 'Talc' (total alcohol consumption through the week) by summing 'Dalc' and 'Walc', then dropped 'Dalc' and 'Walc'.
students.head()
|   | school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | ... | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | ... | 4 | 3 | 4 | 1 | 1 | 3 | 6 | 5 | 6 | 6 |
| 1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | ... | 5 | 3 | 3 | 1 | 1 | 3 | 4 | 5 | 5 | 6 |
| 2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | ... | 4 | 3 | 2 | 2 | 3 | 3 | 10 | 7 | 8 | 10 |
| 3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | ... | 3 | 2 | 2 | 1 | 1 | 5 | 2 | 15 | 14 | 15 |
| 4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | ... | 4 | 3 | 2 | 1 | 2 | 5 | 4 | 6 | 10 | 10 |
5 rows × 33 columns
students.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 790 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      790 non-null    object
 1   sex         790 non-null    object
 2   age         790 non-null    int64
 3   address     790 non-null    object
 4   famsize     790 non-null    object
 5   Pstatus     790 non-null    object
 6   Medu        790 non-null    int64
 7   Fedu        790 non-null    int64
 8   Mjob        790 non-null    object
 9   Fjob        790 non-null    object
 10  reason      790 non-null    object
 11  guardian    790 non-null    object
 12  traveltime  790 non-null    int64
 13  studytime   790 non-null    int64
 14  failures    790 non-null    int64
 15  schoolsup   790 non-null    object
 16  famsup      790 non-null    object
 17  paid        790 non-null    object
 18  activities  790 non-null    object
 19  nursery     790 non-null    object
 20  higher      790 non-null    object
 21  internet    790 non-null    object
 22  romantic    790 non-null    object
 23  famrel      790 non-null    int64
 24  freetime    790 non-null    int64
 25  goout       790 non-null    int64
 26  Dalc        790 non-null    int64
 27  Walc        790 non-null    int64
 28  health      790 non-null    int64
 29  absences    790 non-null    int64
 30  G1          790 non-null    int64
 31  G2          790 non-null    int64
 32  G3          790 non-null    int64
dtypes: int64(16), object(17)
memory usage: 209.8+ KB
students.describe()
|       | age | Medu | Fedu | traveltime | studytime | failures | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 | 790.000000 |
| mean | 16.696203 | 2.749367 | 2.521519 | 1.448101 | 2.035443 | 0.334177 | 3.944304 | 3.235443 | 3.108861 | 1.481013 | 2.291139 | 3.554430 | 5.708861 | 10.908861 | 10.713924 | 10.415190 |
| std | 1.275234 | 1.094041 | 1.087511 | 0.697063 | 0.838708 | 0.743180 | 0.896090 | 0.998229 | 1.112572 | 0.890177 | 1.287080 | 1.389422 | 7.998022 | 3.317091 | 3.759120 | 4.578538 |
| min | 15.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 |
| 25% | 16.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 0.000000 | 4.000000 | 3.000000 | 2.000000 | 1.000000 | 1.000000 | 3.000000 | 0.000000 | 8.000000 | 9.000000 | 8.000000 |
| 50% | 17.000000 | 3.000000 | 2.000000 | 1.000000 | 2.000000 | 0.000000 | 4.000000 | 3.000000 | 3.000000 | 1.000000 | 2.000000 | 4.000000 | 4.000000 | 11.000000 | 11.000000 | 11.000000 |
| 75% | 18.000000 | 4.000000 | 3.000000 | 2.000000 | 2.000000 | 0.000000 | 5.000000 | 4.000000 | 4.000000 | 2.000000 | 3.000000 | 5.000000 | 8.000000 | 13.000000 | 13.000000 | 14.000000 |
| max | 22.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 75.000000 | 19.000000 | 19.000000 | 20.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
students.hist(bins=50, figsize=(20,15))
plt.show()
students['Talc'] = students['Dalc'] + students['Walc']
students = students.drop(columns=['Dalc', 'Walc'])
I used a stratified shuffle split on the 'goout' attribute, since it had the strongest correlation with the 'Talc' attribute.
corr_matrix = students.corr()
corr_matrix['Talc'].sort_values(ascending=False)
Talc          1.000000
goout         0.392683
freetime      0.189754
failures      0.153203
traveltime    0.149134
absences      0.138687
age           0.134972
health        0.094662
Fedu         -0.007127
Medu         -0.021681
G3           -0.058245
G2           -0.083901
famrel       -0.108427
G1           -0.124158
studytime    -0.252698
Name: Talc, dtype: float64
students['goout'].hist()
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(students, students['goout']):
    strat_train_set = students.iloc[train_index]
    strat_test_set = students.iloc[test_index]
strat_test_set['goout'].value_counts()/len(strat_test_set)
3    0.329114
2    0.259494
4    0.221519
5    0.132911
1    0.056962
Name: goout, dtype: float64
students['goout'].value_counts()/len(students)
3    0.329114
2    0.260759
4    0.217722
5    0.134177
1    0.058228
Name: goout, dtype: float64
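To make the comparison easier to read, the two distributions can be placed side by side (a small sketch; the compare frame is my addition, not in the original notebook):
# Compare the 'goout' proportions in the full dataset and the stratified
# test set; the error column should stay close to zero.
compare = pd.DataFrame({
    'overall': students['goout'].value_counts() / len(students),
    'stratified': strat_test_set['goout'].value_counts() / len(strat_test_set),
}).sort_index()
compare['error_pct'] = 100 * (compare['stratified'] / compare['overall'] - 1)
print(compare)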
'Talc' (total alcohol consumption) was most strongly correlated with the attributes 'goout' (positively) and 'studytime' (negatively).
students = strat_train_set.copy()
corr_matrix = students.corr()
corr_matrix['Talc'].sort_values(ascending=False)
Talc          1.000000
goout         0.372098
freetime      0.177675
traveltime    0.165437
absences      0.149713
failures      0.140723
health        0.129867
age           0.119320
Fedu         -0.003216
Medu         -0.011320
G3           -0.059093
G2           -0.083255
G1           -0.123448
famrel       -0.143482
studytime    -0.264073
Name: Talc, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ['Talc', 'goout', 'freetime', 'failures', 'studytime', 'G1']
scatter_matrix(students[attributes], figsize=(12,8), alpha=0.05)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)
sns.catplot(x="sex", y="Talc", kind="swarm", data=students)
/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 64.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 44.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
I couldn't find any attribute combinations with a significant correlation to total alcohol consumption, so I did not keep any of the derived attributes.
students['AvgGrade'] = (students['G1'] + students['G2'] + students['G3']) / 3
students['AvgPedu'] = (students['Fedu'] + students['Medu']) / 2
students['g/f'] = students['goout'] / students['freetime']
corr_matrix = students.corr()
corr_matrix['Talc'].sort_values(ascending=False)
Talc          1.000000
goout         0.372098
freetime      0.177675
traveltime    0.165437
absences      0.149713
failures      0.140723
health        0.129867
age           0.119320
g/f           0.109831
Fedu         -0.003216
AvgPedu      -0.008157
Medu         -0.011320
G3           -0.059093
G2           -0.083255
AvgGrade     -0.089461
G1           -0.123448
famrel       -0.143482
studytime    -0.264073
Name: Talc, dtype: float64
students = students.drop(columns=['AvgGrade', 'AvgPedu', 'g/f'])
There are no missing values, and no other cleaning is needed; the quick check below confirms this.
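# Count missing values across all columns; the total should be zero,
# matching the non-null counts reported by students.info() above.
print(students.isna().sum().sum(), 'missing values in total')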
students = strat_train_set.drop('Talc', axis=1)
students_labels = strat_train_set['Talc'].copy()
I will be using a OneHotEncoder to encode the categorical attributes.
num_attribs = ['goout', 'freetime', 'traveltime', 'absences', 'failures', 'health', 'age',
               'Fedu', 'Medu', 'G3', 'G2', 'G1', 'famrel', 'studytime']
students_num = students[num_attribs]
students_cat = students.drop(columns=num_attribs)
# dropping unnecessary attributes
for i in ['Fedu', 'Medu', 'failures']:
    num_attribs.remove(i)
students_num = students_num.drop(columns=['Fedu', 'Medu', 'failures'])
students_cat = students_cat.drop(columns=['school'])
students_num.head()
|     | goout | freetime | traveltime | absences | health | age | G3 | G2 | G1 | famrel | studytime |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 363 | 4 | 3 | 1 | 0 | 1 | 17 | 15 | 15 | 16 | 2 | 2 |
| 229 | 3 | 2 | 2 | 10 | 3 | 17 | 12 | 10 | 12 | 3 | 3 |
| 228 | 2 | 3 | 4 | 14 | 3 | 18 | 9 | 8 | 10 | 4 | 2 |
| 27 | 4 | 2 | 1 | 4 | 1 | 15 | 15 | 16 | 15 | 2 | 1 |
| 322 | 2 | 3 | 1 | 3 | 3 | 17 | 11 | 11 | 11 | 3 | 3 |
students_cat.head()
|     | sex | address | famsize | Pstatus | Mjob | Fjob | reason | guardian | schoolsup | famsup | paid | activities | nursery | higher | internet | romantic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 363 | F | U | LE3 | T | at_home | at_home | course | mother | no | yes | yes | yes | yes | yes | yes | yes |
| 229 | F | U | GT3 | A | other | other | course | mother | no | no | no | yes | yes | yes | yes | yes |
| 228 | M | U | LE3 | T | at_home | other | course | mother | yes | yes | yes | yes | yes | yes | yes | yes |
| 27 | M | U | GT3 | T | health | services | other | mother | no | no | yes | no | yes | yes | yes | no |
| 322 | F | R | LE3 | T | services | services | course | mother | no | yes | yes | yes | yes | yes | yes | no |
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
students_cat_1hot = encoder.fit_transform(students_cat)
students_cat_1hot
<632x41 sparse matrix of type '<class 'numpy.float64'>'
	with 10112 stored elements in Compressed Sparse Row format>
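To see which dummy columns came from which original categories, the fitted encoder's categories_ attribute can be inspected (a quick sketch, not in the original notebook):
# One array of category values per input column, in column order.
for col, cats in zip(students_cat.columns, encoder.categories_):
    print(col, list(cats))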
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])
students_num_tr = num_pipeline.fit_transform(students_num)
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the given columns from a DataFrame and return them as a NumPy array."""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
from sklearn.preprocessing import OneHotEncoder
cat_attribs = list(students_cat)
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', OneHotEncoder()),
])
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
students_prepared = full_pipeline.fit_transform(students)
students_prepared
<632x52 sparse matrix of type '<class 'numpy.float64'>'
	with 17064 stored elements in Compressed Sparse Row format>
students_prepared.shape
(632, 52)
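As an aside, in scikit-learn 0.20+ a ColumnTransformer can replace the DataFrameSelector/FeatureUnion pattern and work on the DataFrame directly; a minimal equivalent sketch (an alternative, not what this notebook uses):
from sklearn.compose import ColumnTransformer
# Scale the numeric columns and one-hot encode the categorical ones in one step;
# the result should have the same 52 columns as students_prepared above.
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
students_prepared_alt = preprocess.fit_transform(students)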
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(students_prepared, students_labels)
LinearRegression()
some_data = students.iloc[:5]
some_labels = students_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print('Predictions:', lin_reg.predict(some_data_prepared))
print('Labels:', list(some_labels))
Predictions: [3.60050654 2.66409047 4.74485776 5.7124963  3.20760412]
Labels: [2, 3, 9, 6, 4]
import numpy as np
from sklearn.metrics import mean_squared_error
students_predictions = lin_reg.predict(students_prepared)
lin_mse = mean_squared_error(students_labels, students_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
1.5333858911789295
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(students_prepared, students_labels)
DecisionTreeRegressor()
students_predictions = tree_reg.predict(students_prepared)
tree_mse = mean_squared_error(students_labels, students_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
0.0
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, students_prepared, students_labels,
                         scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standard deviation:', scores.std())
display_scores(tree_rmse_scores)
Scores: [1.11803399 1.19895788 1.48002574 1.5430335  1.58865022 1.30930734
 0.67846699 0.78679579 0.89087081 1.20185043]
Mean: 1.17959926879395
Standard deviation: 0.29998996276407486
lin_scores = cross_val_score(lin_reg, students_prepared, students_labels,
                             scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
Scores: [1.55535414 1.52362908 1.86213837 1.49956349 1.70911196 1.73180764
 1.8846926  1.6193409  1.37894009 1.58108828]
Mean: 1.634566655788508
Standard deviation: 0.15316951655112016
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(students_prepared, students_labels)
students_predictions = forest_reg.predict(students_prepared)
forest_mse = mean_squared_error(students_labels, students_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
0.3488527762447518
forest_scores = cross_val_score(forest_reg, students_prepared, students_labels,
                                scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
Scores: [1.04961749 1.04525939 1.18561952 0.92073987 1.18697695 1.1550256
 0.91874483 0.87086893 0.81685129 0.9284934 ]
Mean: 1.0078197273697205
Standard deviation: 0.12867120721007622
from sklearn.model_selection import GridSearchCV
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [2, 4, 8]}]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(students_prepared, students_labels)
GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 4, 8],
                          'n_estimators': [3, 10, 30]}],
             scoring='neg_mean_squared_error')
grid_search.best_params_
{'bootstrap': False, 'max_features': 8, 'n_estimators': 30}
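Note that the best max_features value (8) sits at the edge of the searched grid, so a wider follow-up search might do better still (a sketch of such a grid; it was not run in this notebook):
# Follow-up grid extending past the previous best values.
param_grid_wider = [{'bootstrap': [False], 'n_estimators': [30, 100],
                     'max_features': [8, 12, 16]}]
grid_search_wider = GridSearchCV(RandomForestRegressor(), param_grid_wider,
                                 cv=5, scoring='neg_mean_squared_error')
# grid_search_wider.fit(students_prepared, students_labels)  # not run here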
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)
1.5100918689994982 {'max_features': 2, 'n_estimators': 3}
1.2102037745161012 {'max_features': 2, 'n_estimators': 10}
1.1221781488955314 {'max_features': 2, 'n_estimators': 30}
1.4390972333780414 {'max_features': 4, 'n_estimators': 3}
1.1610383816801808 {'max_features': 4, 'n_estimators': 10}
1.137086416328642 {'max_features': 4, 'n_estimators': 30}
1.3802870739468447 {'max_features': 6, 'n_estimators': 3}
1.2045101782068197 {'max_features': 6, 'n_estimators': 10}
1.1092775960615457 {'max_features': 6, 'n_estimators': 30}
1.4080673127355239 {'max_features': 8, 'n_estimators': 3}
1.1849268410107359 {'max_features': 8, 'n_estimators': 10}
1.1328201378934335 {'max_features': 8, 'n_estimators': 30}
1.0642466454655621 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.9987349828929811 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.9650762025634528 {'bootstrap': False, 'max_features': 2, 'n_estimators': 30}
1.1630134863355066 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
0.9960504432202819 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
0.9739229611515573 {'bootstrap': False, 'max_features': 4, 'n_estimators': 30}
1.127809161964813 {'bootstrap': False, 'max_features': 8, 'n_estimators': 3}
1.0055676775860112 {'bootstrap': False, 'max_features': 8, 'n_estimators': 10}
0.9313698017841407 {'bootstrap': False, 'max_features': 8, 'n_estimators': 30}
feature_importances = grid_search.best_estimator_.feature_importances_
cat_encoder = cat_pipeline.named_steps['one_hot_encoder']
# flatten the category lists of every categorical column (categories_[0]
# would cover only the first column, 'sex')
cat_one_hot_attribs = [cat for cats in cat_encoder.categories_ for cat in cats]
attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
[(0.09521609333521416, 'goout'),
 (0.0761405528667081, 'absences'),
 (0.056828081726636295, 'G3'),
 (0.05275332948729314, 'G2'),
 (0.04415222720340103, 'G1'),
 (0.04206895730466228, 'traveltime'),
 (0.04205720413629535, 'age'),
 (0.04121722039447791, 'M'),
 (0.040705648481339424, 'famrel'),
 (0.04040620244690753, 'studytime'),
 (0.035927554500979224, 'health'),
 (0.0312516559041118, 'F'),
 (0.02648904618240103, 'freetime')]
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop('Talc', axis=1)
y_test = strat_test_set['Talc'].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
0.6714595643478087
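A single test-set RMSE is only a point estimate; a 95% confidence interval for the generalization error (a sketch using scipy.stats, an addition not in the original notebook) gives a sense of its precision:
from scipy import stats
# 95% t-interval for the test-set squared errors, converted to the RMSE scale.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))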
Looking at the correlation matrix, you can see that how students spend their time is the main factor in how much they drink. The feature importances show that how much a student goes out, their age, their final grade, and their absences largely determine how much they drink.
You can also see that the random forest regression model with the parameters {'bootstrap': False, 'max_features': 8, 'n_estimators': 30} was the best model tested for predicting a student's alcohol consumption.
These conclusions rest on the assumption that the data accurately portrays the population; otherwise, they hold only for this dataset.