import os

AVOCADOS_PATH = os.path.join("datasets", "avocados")


import pandas as pd

def load_avocados_data(avocados_path=AVOCADOS_PATH):
    csv_path = os.path.join(avocados_path, "avocado.csv")
    return pd.read_csv(csv_path)


avocados = load_avocados_data()
avocados.head()


avocados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    18249 non-null  int64  
 1   Date          18249 non-null  object 
 2   AveragePrice  18249 non-null  float64
 3   Total Volume  18249 non-null  float64
 4   4046          18249 non-null  float64
 5   4225          18249 non-null  float64
 6   4770          18249 non-null  float64
 7   Total Bags    18249 non-null  float64
 8   Small Bags    18249 non-null  float64
 9   Large Bags    18249 non-null  float64
 10  XLarge Bags   18249 non-null  float64
 11  type          18249 non-null  object 
 12  year          18249 non-null  int64  
 13  region        18249 non-null  object 
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB


avocados["type"].value_counts()

conventional    9126
organic         9123
Name: type, dtype: int64


avocados["region"].value_counts()

DallasFtWorth          338
Roanoke                338
HartfordSpringfield    338
West                   338
SouthCentral           338
Chicago                338
RaleighGreensboro      338
Indianapolis           338
CincinnatiDayton       338
BuffaloRochester       338
Houston                338
Orlando                338
RichmondNorfolk        338
Jacksonville           338
Albany                 338
Atlanta                338
BaltimoreWashington    338
LosAngeles             338
Denver                 338
Portland               338
California             338
Nashville              338
Boise                  338
MiamiFtLauderdale      338
Boston                 338
Sacramento             338
Pittsburgh             338
Plains                 338
NorthernNewEngland     338
SouthCarolina          338
TotalUS                338
GrandRapids            338
Syracuse               338
SanFrancisco           338
Philadelphia           338
Spokane                338
Seattle                338
Tampa                  338
NewYork                338
SanDiego               338
NewOrleansMobile       338
Midsouth               338
Columbus               338
Louisville             338
Charlotte              338
LasVegas               338
Northeast              338
HarrisburgScranton     338
PhoenixTucson          338
Southeast              338
StLouis                338
GreatLakes             338
Detroit                338
WestTexNewMexico       335
Name: region, dtype: int64


avocados.describe()


%matplotlib inline
import matplotlib.pyplot as plt
avocados.hist(bins=50, figsize=(20,15))
plt.show()


#will change everytime it is ran
#eventually showing the model the entire dataset

import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]


avocados_with_id = avocados.reset_index() # adds an 'index column'
train_set, test_set = split_train_test_by_id(avocados_with_id, 0.2, "index")


from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)


#Checking if the sets were split evenly
train_set["type"].value_counts()

conventional    7306
organic         7293
Name: type, dtype: int64


test_set["type"].value_counts()

organic         1830
conventional    1820
Name: type, dtype: int64


corr_matrix = avocados.corr()


corr_matrix["AveragePrice"].sort_values(ascending=False)

AveragePrice    1.000000
year            0.093197
XLarge Bags    -0.117592
Unnamed: 0     -0.133008
4225           -0.172928
Large Bags     -0.172940
Small Bags     -0.174730
Total Bags     -0.177088
4770           -0.179446
Total Volume   -0.192752
4046           -0.208317
Name: AveragePrice, dtype: float64


from pandas.plotting import scatter_matrix

attributes = ["AveragePrice", "4046", "Total Volume", "4770"]
scatter_matrix(avocados[attributes], figsize=(12,8))

array([[<AxesSubplot:xlabel='AveragePrice', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='4046', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='Total Volume', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='4770', ylabel='AveragePrice'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='4046'>,
        <AxesSubplot:xlabel='4046', ylabel='4046'>,
        <AxesSubplot:xlabel='Total Volume', ylabel='4046'>,
        <AxesSubplot:xlabel='4770', ylabel='4046'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='Total Volume'>,
        <AxesSubplot:xlabel='4046', ylabel='Total Volume'>,
        <AxesSubplot:xlabel='Total Volume', ylabel='Total Volume'>,
        <AxesSubplot:xlabel='4770', ylabel='Total Volume'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='4770'>,
        <AxesSubplot:xlabel='4046', ylabel='4770'>,
        <AxesSubplot:xlabel='Total Volume', ylabel='4770'>,
        <AxesSubplot:xlabel='4770', ylabel='4770'>]], dtype=object)


avocados.plot(kind="scatter", x="Total Volume", y="AveragePrice",
             alpha=0.1)

<AxesSubplot:xlabel='Total Volume', ylabel='AveragePrice'>


avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]
avocados["4046_ratio"] = avocados["4046"]/avocados["Total Volume"]
avocados["4225_ratio"] = avocados["4225"]/avocados["Total Volume"]
avocados["4770_ratio"] = avocados["4770"]/avocados["Total Volume"]


avocados.describe()


corr_matrix = avocados.corr()
corr_matrix["AveragePrice"].sort_values(ascending=False)

AveragePrice      1.000000
4225_ratio        0.156095
year              0.093197
volume_per_bag    0.043009
XLarge Bags      -0.117592
4770_ratio       -0.126823
Unnamed: 0       -0.133008
4225             -0.172928
Large Bags       -0.172940
Small Bags       -0.174730
Total Bags       -0.177088
4770             -0.179446
Total Volume     -0.192752
4046             -0.208317
4046_ratio       -0.341860
Name: AveragePrice, dtype: float64


attributes = ["AveragePrice", "4046_ratio", "4770_ratio", "4225_ratio"]
scatter_matrix(avocados[attributes], figsize=(12,8))

array([[<AxesSubplot:xlabel='AveragePrice', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='4046_ratio', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='4770_ratio', ylabel='AveragePrice'>,
        <AxesSubplot:xlabel='4225_ratio', ylabel='AveragePrice'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='4046_ratio'>,
        <AxesSubplot:xlabel='4046_ratio', ylabel='4046_ratio'>,
        <AxesSubplot:xlabel='4770_ratio', ylabel='4046_ratio'>,
        <AxesSubplot:xlabel='4225_ratio', ylabel='4046_ratio'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='4770_ratio'>,
        <AxesSubplot:xlabel='4046_ratio', ylabel='4770_ratio'>,
        <AxesSubplot:xlabel='4770_ratio', ylabel='4770_ratio'>,
        <AxesSubplot:xlabel='4225_ratio', ylabel='4770_ratio'>],
       [<AxesSubplot:xlabel='AveragePrice', ylabel='4225_ratio'>,
        <AxesSubplot:xlabel='4046_ratio', ylabel='4225_ratio'>,
        <AxesSubplot:xlabel='4770_ratio', ylabel='4225_ratio'>,
        <AxesSubplot:xlabel='4225_ratio', ylabel='4225_ratio'>]],
      dtype=object)


avocados.plot(kind="scatter", x="volume_per_bag", y="AveragePrice",
             alpha=0.1)

<AxesSubplot:xlabel='volume_per_bag', ylabel='AveragePrice'>


avocados["volume_per_bag"].max()

inf


avocados.describe()


avocados["Total Bags"].count()

18249


avocados["Total Volume"].max()

62505646.52


###fixing Total Bags having 0 for a value
def indexes_for_value(data, value):
    indexes = []
    for i in range(len(data)):
        if data[i]==value:
            indexes.append(i)
    return indexes
            
def replaceValueMean(data, value):
    indexes = indexes_for_value(data,value)
    total=0
    count=len(data)-len(indexes)
    for i in range(len(data)):
        if data[i]!=value:
            total+=data[i]
    mean=total/count
    for i in indexes:
        data[i] = mean
    return data


indexes_for_value(avocados["Total Bags"], 0)

[9212,
 11024,
 11320,
 11321,
 11333,
 11347,
 11348,
 11387,
 11388,
 11594,
 11595,
 11596,
 11597,
 11614,
 11662]


replaceValueMean(avocados["Total Bags"], 0)

<ipython-input-28-caa511a548a3>:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = mean

0         8696.87
1         9505.56
2         8145.35
3         5811.16
4         6183.95
           ...   
18244    13498.67
18245     9264.84
18246     9394.11
18247    10969.54
18248    12014.15
Name: Total Bags, Length: 18249, dtype: float64


avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]


avocados.describe()


train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)
avocados = train_set.copy()


avocados_labels = train_set["AveragePrice"].copy()
avocados = train_set.drop("AveragePrice", axis=1)


#substitute for missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")


avocados = avocados.drop("Unnamed: 0", axis=1)


avocados.describe()


avocados_num = avocados.drop(["Date", "type", "year", "region"], axis=1)


imputer.fit(avocados_num)

SimpleImputer(strategy='median')


imputer.statistics_

array([1.09111930e+05, 8.76248000e+03, 2.97112500e+04, 1.90580000e+02,
       4.04179800e+04, 2.65588900e+04, 2.69384000e+03, 0.00000000e+00,
       2.82593365e+00, 1.58193823e-01, 3.13863273e-01, 1.57822821e-03])


avocados_num.median().values

array([1.09111930e+05, 8.76248000e+03, 2.97112500e+04, 1.90580000e+02,
       4.04179800e+04, 2.65588900e+04, 2.69384000e+03, 0.00000000e+00,
       2.82593365e+00, 1.58193823e-01, 3.13863273e-01, 1.57822821e-03])


X = imputer.transform(avocados_num)


avocados_tr = pd.DataFrame(X, columns=avocados_num.columns)


def toMonth(dates):
    Month= []
    for date in dates:
        Month.append(int(date[5:7]))
    return pd.DataFrame(Month, columns=["Month"])


avocados["Month"]=toMonth(avocados["Date"])


avocados.describe()


from sklearn.preprocessing import OneHotEncoder


#sparse=False to make it a dense Matrix
type_encoder = OneHotEncoder()
avocados_type_reshaped = avocados["type"].values.reshape(-1,1)
avocados_type_1hot = type_encoder.fit_transform(avocados_type_reshaped)
avocados_type_1hot

<14599x2 sparse matrix of type '<class 'numpy.float64'>'
	with 14599 stored elements in Compressed Sparse Row format>


type_encoder.categories_

[array(['conventional', 'organic'], dtype=object)]


region_encoder = OneHotEncoder()
avocados_region_reshaped = avocados["region"].values.reshape(-1,1)
avocados_region_1hot = region_encoder.fit_transform(avocados_region_reshaped)
avocados_region_1hot

<14599x54 sparse matrix of type '<class 'numpy.float64'>'
	with 14599 stored elements in Compressed Sparse Row format>


region_encoder.categories_

[array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston',
        'BuffaloRochester', 'California', 'Charlotte', 'Chicago',
        'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver',
        'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton',
        'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville',
        'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale',
        'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork',
        'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia',
        'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland',
        'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento',
        'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina',
        'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse',
        'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object)]


from sklearn.base import BaseEstimator, TransformerMixin

#indexes
volume_ix, ix_4046, ix_4225, ix_4770, bags_ix = 1, 2, 3, 4, 5

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_volume_per_bag = True):
        self.add_volume_per_bag = add_volume_per_bag
    def fit(self, X, y=None):
        return self #nothing else to do
    def transform(self, X, y=None):
        ratio_4046 = X[:, ix_4046] / X[:, volume_ix]
        ratio_4225 = X[:, ix_4225] / X[:, volume_ix]
        ratio_4770 = X[:, ix_4770] / X[:, volume_ix]
        if self.add_volume_per_bag:
            volume_per_bag = X[:, volume_ix] / replaceValueMean(X[:, bags_ix], 0)
            return np.c_[X,ratio_4046, ratio_4225, ratio_4770, volume_per_bag]
        else:
            return np.c_[X,ratio_4046, ratio_4225, ratio_4770]
        
attr_adder = CombinedAttributesAdder(add_volume_per_bag=False)
avocados_extra_attribs = attr_adder.transform(avocados.values)


avocados_num.describe()


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
    ('attribs_adder', CombinedAttributesAdder()),
])

avocados_num_tr = num_pipeline.fit_transform(avocados_num)


from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values


num_attribs = list(avocados_num)
cat_attribs = ["type", "region"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler0', StandardScaler()),
    ('attribs_adder', CombinedAttributesAdder(add_volume_per_bag=False)),
    ('std_scaler1', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder()),
])


from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])


avocados_prepared = full_pipeline.fit_transform(avocados)


avocados_prepared

<14599x71 sparse matrix of type '<class 'numpy.float64'>'
	with 248183 stored elements in Compressed Sparse Row format>


avocados_prepared.shape

(14599, 71)


from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(avocados_prepared, avocados_labels)

LinearRegression()


some_data = avocados.iloc[:5]
some_labels = avocados_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [1.72196892 1.08580474 1.51202689 0.99730759 1.87637844]
Labels: [1.88, 0.81, 2.01, 1.02, 1.53]


from sklearn.metrics import mean_squared_error
avocados_predictions = lin_reg.predict(avocados_prepared)
lin_mse = mean_squared_error(avocados_labels, avocados_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.2650199722094959


from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(avocados_prepared, avocados_labels)

DecisionTreeRegressor()


avocados_predictions = tree_reg.predict(avocados_prepared)
tree_mse = mean_squared_error(avocados_labels, avocados_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.1767131303343808e-17


from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, avocados_prepared, avocados_labels,
                        scoring = "neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


display_scores(tree_rmse_scores)

Scores: [0.24568788 0.23994349 0.23141805 0.25191581 0.2435969  0.23373017
 0.2424422  0.24040975 0.24400469 0.24816061]
Mean: 0.24213095548650268
Standard deviation: 0.005861702130644642


lin_scores = cross_val_score(lin_reg, avocados_prepared, avocados_labels,
                        scoring = "neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)


display_scores(lin_rmse_scores)

Scores: [0.24568788 0.23994349 0.23141805 0.25191581 0.2435969  0.23373017
 0.2424422  0.24040975 0.24400469 0.24816061]
Mean: 0.24213095548650268
Standard deviation: 0.005861702130644642


from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(avocados_prepared, avocados_labels)

RandomForestRegressor()


avocados_predictions = forest_reg.predict(avocados_prepared)
forest_mse = mean_squared_error(avocados_labels, avocados_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.06255144638968366


forest_scores = cross_val_score(forest_reg, avocados_prepared, avocados_labels,
                               scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [0.16596757 0.16742614 0.1704742  0.1704822  0.17054483 0.16186383
 0.17120198 0.17965895 0.17411243 0.17798801]
Mean: 0.17097201474640739
Standard deviation: 0.005058151190141573


from sklearn.svm import SVR
SVR_reg = SVR()
SVR_reg.fit(avocados_prepared, avocados_labels)

SVR()


avocados_predictions = SVR_reg.predict(avocados_prepared)
SVR_mse = mean_squared_error(avocados_labels, avocados_predictions)
SVR_rmse = np.sqrt(SVR_mse)
SVR_rmse

0.21843106358993253


SVR_scores = cross_val_score(SVR_reg, avocados_prepared, avocados_labels,
                               scoring="neg_mean_squared_error", cv=10)
SVR_rmse_scores = np.sqrt(-SVR_scores)
display_scores(SVR_rmse_scores)

Scores: [0.22171831 0.2261141  0.22090493 0.21825387 0.22662064 0.21606678
 0.2329331  0.23183207 0.22789418 0.22293031]
Mean: 0.22452682879791705
Standard deviation: 0.005260720932364574


from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')

grid_search.fit(avocados_prepared, avocados_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')


grid_search.best_params_

{'max_features': 2, 'n_estimators': 30}


cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.2116363942459375 {'max_features': 2, 'n_estimators': 3}
0.18121439333957437 {'max_features': 2, 'n_estimators': 10}
0.1728490827684643 {'max_features': 2, 'n_estimators': 30}
0.2129284130657655 {'max_features': 4, 'n_estimators': 3}
0.18283133410679514 {'max_features': 4, 'n_estimators': 10}
0.1745044427561634 {'max_features': 4, 'n_estimators': 30}
0.21046989651130088 {'max_features': 6, 'n_estimators': 3}
0.1856221001647122 {'max_features': 6, 'n_estimators': 10}
0.1758508601890044 {'max_features': 6, 'n_estimators': 30}
0.21468792020110378 {'max_features': 8, 'n_estimators': 3}
0.18266417807260465 {'max_features': 8, 'n_estimators': 10}
0.17597667701609293 {'max_features': 8, 'n_estimators': 30}
0.2004828883916928 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.1732568969539697 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.1971529452298 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.17499132060465458 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
0.2002905008860829 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
0.17498023087381942 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}


feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.06970893, 0.05294806, 0.06390707, 0.05290015, 0.05963685,
       0.06443708, 0.06422035, 0.03020833, 0.03681935, 0.04057721,
       0.04708022, 0.0416463 , 0.0648686 , 0.06912735, 0.03794594,
       0.04115925, 0.05027523, 0.00077781, 0.00105933, 0.001451  ,
       0.00142611, 0.00136066, 0.00103711, 0.00194799, 0.00107922,
       0.00150251, 0.00290214, 0.00217187, 0.00378784, 0.00162447,
       0.00145296, 0.00177337, 0.00052173, 0.00133792, 0.00657373,
       0.00323851, 0.00215259, 0.00140908, 0.00198641, 0.00133939,
       0.00235662, 0.00124424, 0.0007067 , 0.00306852, 0.00087995,
       0.00569818, 0.00439279, 0.00094297, 0.00157611, 0.00252846,
       0.0030411 , 0.00147902, 0.00128897, 0.00189814, 0.00161016,
       0.00283592, 0.00212788, 0.00233279, 0.00159348, 0.00957793,
       0.00218824, 0.00073087, 0.0022978 , 0.00118673, 0.00224612,
       0.0014772 , 0.00055826, 0.00072755, 0.0012682 , 0.00218302,
       0.00257604])


extra_attribs = ["ratio_4046", "ratio_4225", "ratio_4770"]
cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse =True)

[(0.06970892635486484, 'Total Volume'),
 (0.06912734626746812, 'organic'),
 (0.06486860263608833, 'conventional'),
 (0.0644370847531914, 'Small Bags'),
 (0.0642203465505144, 'Large Bags'),
 (0.0639070679721322, '4225'),
 (0.05963684904651195, 'Total Bags'),
 (0.05294806173116172, '4046'),
 (0.05290015336792799, '4770'),
 (0.04708021940211616, '4225_ratio'),
 (0.04164629930207952, '4770_ratio'),
 (0.040577214856152885, '4046_ratio'),
 (0.03681934627802098, 'volume_per_bag'),
 (0.030208334402390943, 'XLarge Bags')]


final_model = grid_search.best_estimator_

X_test = test_set.drop("AveragePrice", axis=1)
y_test = test_set["AveragePrice"].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

0.16731021443379146

	Unnamed: 0	Date	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	type	year	region
0	0	2015-12-27	1.33	64236.62	1036.74	54454.85	48.16	8696.87	8603.62	93.25	conventional	2015	Albany
1	1	2015-12-20	1.35	54876.98	674.28	44638.81	58.33	9505.56	9408.07	97.49	conventional	2015	Albany
2	2	2015-12-13	0.93	118220.22	794.70	109149.67	130.50	8145.35	8042.21	103.14	conventional	2015	Albany
3	3	2015-12-06	1.08	78992.15	1132.00	71976.41	72.58	5811.16	5677.40	133.76	conventional	2015	Albany
4	4	2015-11-29	1.28	51039.60	941.48	43838.39	75.78	6183.95	5986.26	197.69	conventional	2015	Albany

	Unnamed: 0	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year
count	18249.000000	18249.000000	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	18249.000000	18249.000000
mean	24.232232	1.405978	8.506440e+05	2.930084e+05	2.951546e+05	2.283974e+04	2.396392e+05	1.821947e+05	5.433809e+04	3106.426507	2016.147899
std	15.481045	0.402677	3.453545e+06	1.264989e+06	1.204120e+06	1.074641e+05	9.862424e+05	7.461785e+05	2.439660e+05	17692.894652	0.939938
min	0.000000	0.440000	8.456000e+01	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000	2015.000000
25%	10.000000	1.100000	1.083858e+04	8.540700e+02	3.008780e+03	0.000000e+00	5.088640e+03	2.849420e+03	1.274700e+02	0.000000	2015.000000
50%	24.000000	1.370000	1.073768e+05	8.645300e+03	2.906102e+04	1.849900e+02	3.974383e+04	2.636282e+04	2.647710e+03	0.000000	2016.000000
75%	38.000000	1.660000	4.329623e+05	1.110202e+05	1.502069e+05	6.243420e+03	1.107834e+05	8.333767e+04	2.202925e+04	132.500000	2017.000000
max	52.000000	3.250000	6.250565e+07	2.274362e+07	2.047057e+07	2.546439e+06	1.937313e+07	1.338459e+07	5.719097e+06	551693.650000	2018.000000

	Unnamed: 0	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year	volume_per_bag	4046_ratio	4225_ratio	4770_ratio
count	18249.000000	18249.000000	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	18249.000000	18249.000000	1.824900e+04	18249.000000	18249.000000	18249.000000
mean	24.232232	1.405978	8.506440e+05	2.930084e+05	2.951546e+05	2.283974e+04	2.396392e+05	1.821947e+05	5.433809e+04	3106.426507	2016.147899	inf	0.225185	0.347391	0.019313
std	15.481045	0.402677	3.453545e+06	1.264989e+06	1.204120e+06	1.074641e+05	9.862424e+05	7.461785e+05	2.439660e+05	17692.894652	0.939938	NaN	0.213921	0.234912	0.041866
min	0.000000	0.440000	8.456000e+01	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000	2015.000000	1.001824e+00	0.000000	0.000000	0.000000
25%	10.000000	1.100000	1.083858e+04	8.540700e+02	3.008780e+03	0.000000e+00	5.088640e+03	2.849420e+03	1.274700e+02	0.000000	2015.000000	1.788166e+00	0.032779	0.157279	0.000000
50%	24.000000	1.370000	1.073768e+05	8.645300e+03	2.906102e+04	1.849900e+02	3.974383e+04	2.636282e+04	2.647710e+03	0.000000	2016.000000	2.832644e+00	0.157837	0.316862	0.001516
75%	38.000000	1.660000	4.329623e+05	1.110202e+05	1.502069e+05	6.243420e+03	1.107834e+05	8.333767e+04	2.202925e+04	132.500000	2017.000000	4.368265e+00	0.378737	0.517175	0.019184
max	52.000000	3.250000	6.250565e+07	2.274362e+07	2.047057e+07	2.546439e+06	1.937313e+07	1.338459e+07	5.719097e+06	551693.650000	2018.000000	inf	0.972523	0.992628	0.448719

	Unnamed: 0	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year	volume_per_bag	4046_ratio	4225_ratio	4770_ratio
count	18249.000000	18249.000000	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	18249.000000	18249.000000	1.824900e+04	18249.000000	18249.000000	18249.000000
mean	24.232232	1.405978	8.506440e+05	2.930084e+05	2.951546e+05	2.283974e+04	2.396392e+05	1.821947e+05	5.433809e+04	3106.426507	2016.147899	inf	0.225185	0.347391	0.019313
std	15.481045	0.402677	3.453545e+06	1.264989e+06	1.204120e+06	1.074641e+05	9.862424e+05	7.461785e+05	2.439660e+05	17692.894652	0.939938	NaN	0.213921	0.234912	0.041866
min	0.000000	0.440000	8.456000e+01	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000	2015.000000	1.001824e+00	0.000000	0.000000	0.000000
25%	10.000000	1.100000	1.083858e+04	8.540700e+02	3.008780e+03	0.000000e+00	5.088640e+03	2.849420e+03	1.274700e+02	0.000000	2015.000000	1.788166e+00	0.032779	0.157279	0.000000
50%	24.000000	1.370000	1.073768e+05	8.645300e+03	2.906102e+04	1.849900e+02	3.974383e+04	2.636282e+04	2.647710e+03	0.000000	2016.000000	2.832644e+00	0.157837	0.316862	0.001516
75%	38.000000	1.660000	4.329623e+05	1.110202e+05	1.502069e+05	6.243420e+03	1.107834e+05	8.333767e+04	2.202925e+04	132.500000	2017.000000	4.368265e+00	0.378737	0.517175	0.019184
max	52.000000	3.250000	6.250565e+07	2.274362e+07	2.047057e+07	2.546439e+06	1.937313e+07	1.338459e+07	5.719097e+06	551693.650000	2018.000000	inf	0.972523	0.992628	0.448719

	Unnamed: 0	AveragePrice	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year	volume_per_bag	4046_ratio	4225_ratio	4770_ratio
count	18249.000000	18249.000000	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	1.824900e+04	18249.000000	18249.000000	18249.000000	18249.000000	18249.000000	18249.000000
mean	24.232232	1.405978	8.506440e+05	2.930084e+05	2.951546e+05	2.283974e+04	2.398363e+05	1.821947e+05	5.433809e+04	3106.426507	2016.147899	8.406657	0.225185	0.347391	0.019313
std	15.481045	0.402677	3.453545e+06	1.264989e+06	1.204120e+06	1.074641e+05	9.862184e+05	7.461785e+05	2.439660e+05	17692.894652	0.939938	116.793082	0.213921	0.234912	0.041866
min	0.000000	0.440000	8.456000e+01	0.000000e+00	0.000000e+00	0.000000e+00	3.090000e+00	0.000000e+00	0.000000e+00	0.000000	2015.000000	0.011618	0.000000	0.000000	0.000000
25%	10.000000	1.100000	1.083858e+04	8.540700e+02	3.008780e+03	0.000000e+00	5.108010e+03	2.849420e+03	1.274700e+02	0.000000	2015.000000	1.784211	0.032779	0.157279	0.000000
50%	24.000000	1.370000	1.073768e+05	8.645300e+03	2.906102e+04	1.849900e+02	3.992521e+04	2.636282e+04	2.647710e+03	0.000000	2016.000000	2.828519	0.157837	0.316862	0.001516
75%	38.000000	1.660000	4.329623e+05	1.110202e+05	1.502069e+05	6.243420e+03	1.112384e+05	8.333767e+04	2.202925e+04	132.500000	2017.000000	4.361341	0.378737	0.517175	0.019184
max	52.000000	3.250000	6.250565e+07	2.274362e+07	2.047057e+07	2.546439e+06	1.937313e+07	1.338459e+07	5.719097e+06	551693.650000	2018.000000	7708.260450	0.972523	0.992628	0.448719

Price Prediction of Avocados¶

Data Preperation¶

Quick Look at Dataset¶

Create Train and Test Sets¶

Gaining Insights on the Data¶

Data Cleaning¶

Text and Categorical Attributes¶

Scaling and Pipeline¶

Training¶

Conclusion¶

	Total Volume	4046	4225	4770	Total Bags	Small Bags	Large Bags	XLarge Bags	year	volume_per_bag	4046_ratio	4225_ratio	4770_ratio
count	1.459900e+04	1.459900e+04	1.459900e+04	1.459900e+04	1.459900e+04	1.459900e+04	1.459900e+04	14599.000000	14599.000000	14599.000000	14599.000000	14599.000000	14599.000000
mean	8.486369e+05	2.934236e+05	2.943944e+05	2.277926e+04	2.382348e+05	1.808815e+05	5.403939e+04	3116.713649	2016.148777	8.733382	0.225752	0.345798	0.019209
std	3.404709e+06	1.243118e+06	1.195564e+06	1.076246e+05	9.670298e+05	7.305218e+05	2.401997e+05	17607.337329	0.940367	127.947791	0.214598	0.234828	0.041561
min	8.456000e+01	0.000000e+00	0.000000e+00	0.000000e+00	3.090000e+00	0.000000e+00	0.000000e+00	0.000000	2015.000000	0.011618	0.000000	0.000000	0.000000
25%	1.086533e+04	8.547250e+02	2.997545e+03	0.000000e+00	5.160235e+03	2.884600e+03	1.236350e+02	0.000000	2015.000000	1.772124	0.032368	0.156055	0.000000
50%	1.091119e+05	8.762480e+03	2.971125e+04	1.905800e+02	4.041798e+04	2.655889e+04	2.693840e+03	0.000000	2016.000000	2.825934	0.158194	0.313863	0.001578
75%	4.354375e+05	1.113795e+05	1.507282e+05	6.231185e+03	1.121544e+05	8.370342e+04	2.242839e+04	133.925000	2017.000000	4.358727	0.379883	0.514511	0.019113
max	5.228870e+07	1.778761e+07	2.047057e+07	2.546439e+06	1.639452e+07	1.254033e+07	4.324231e+06	551693.650000	2018.000000	7708.260450	0.972523	0.992628	0.448719