import os
AVOCADOS_PATH = os.path.join("datasets", "avocados")
import pandas as pd
def load_avocados_data(avocados_path=AVOCADOS_PATH):
csv_path = os.path.join(avocados_path, "avocado.csv")
return pd.read_csv(csv_path)
Looking at histograms, format of attributes, among other things.
avocados = load_avocados_data()
avocados.head()
Unnamed: 0 | Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2015-12-27 | 1.33 | 64236.62 | 1036.74 | 54454.85 | 48.16 | 8696.87 | 8603.62 | 93.25 | 0.0 | conventional | 2015 | Albany |
1 | 1 | 2015-12-20 | 1.35 | 54876.98 | 674.28 | 44638.81 | 58.33 | 9505.56 | 9408.07 | 97.49 | 0.0 | conventional | 2015 | Albany |
2 | 2 | 2015-12-13 | 0.93 | 118220.22 | 794.70 | 109149.67 | 130.50 | 8145.35 | 8042.21 | 103.14 | 0.0 | conventional | 2015 | Albany |
3 | 3 | 2015-12-06 | 1.08 | 78992.15 | 1132.00 | 71976.41 | 72.58 | 5811.16 | 5677.40 | 133.76 | 0.0 | conventional | 2015 | Albany |
4 | 4 | 2015-11-29 | 1.28 | 51039.60 | 941.48 | 43838.39 | 75.78 | 6183.95 | 5986.26 | 197.69 | 0.0 | conventional | 2015 | Albany |
avocados.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18249 entries, 0 to 18248 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 18249 non-null int64 1 Date 18249 non-null object 2 AveragePrice 18249 non-null float64 3 Total Volume 18249 non-null float64 4 4046 18249 non-null float64 5 4225 18249 non-null float64 6 4770 18249 non-null float64 7 Total Bags 18249 non-null float64 8 Small Bags 18249 non-null float64 9 Large Bags 18249 non-null float64 10 XLarge Bags 18249 non-null float64 11 type 18249 non-null object 12 year 18249 non-null int64 13 region 18249 non-null object dtypes: float64(9), int64(2), object(3) memory usage: 1.9+ MB
avocados["type"].value_counts()
conventional 9126 organic 9123 Name: type, dtype: int64
avocados["region"].value_counts()
DallasFtWorth 338 Roanoke 338 HartfordSpringfield 338 West 338 SouthCentral 338 Chicago 338 RaleighGreensboro 338 Indianapolis 338 CincinnatiDayton 338 BuffaloRochester 338 Houston 338 Orlando 338 RichmondNorfolk 338 Jacksonville 338 Albany 338 Atlanta 338 BaltimoreWashington 338 LosAngeles 338 Denver 338 Portland 338 California 338 Nashville 338 Boise 338 MiamiFtLauderdale 338 Boston 338 Sacramento 338 Pittsburgh 338 Plains 338 NorthernNewEngland 338 SouthCarolina 338 TotalUS 338 GrandRapids 338 Syracuse 338 SanFrancisco 338 Philadelphia 338 Spokane 338 Seattle 338 Tampa 338 NewYork 338 SanDiego 338 NewOrleansMobile 338 Midsouth 338 Columbus 338 Louisville 338 Charlotte 338 LasVegas 338 Northeast 338 HarrisburgScranton 338 PhoenixTucson 338 Southeast 338 StLouis 338 GreatLakes 338 Detroit 338 WestTexNewMexico 335 Name: region, dtype: int64
avocados.describe()
Unnamed: 0 | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 18249.000000 | 18249.000000 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 18249.000000 | 18249.000000 |
mean | 24.232232 | 1.405978 | 8.506440e+05 | 2.930084e+05 | 2.951546e+05 | 2.283974e+04 | 2.396392e+05 | 1.821947e+05 | 5.433809e+04 | 3106.426507 | 2016.147899 |
std | 15.481045 | 0.402677 | 3.453545e+06 | 1.264989e+06 | 1.204120e+06 | 1.074641e+05 | 9.862424e+05 | 7.461785e+05 | 2.439660e+05 | 17692.894652 | 0.939938 |
min | 0.000000 | 0.440000 | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 |
25% | 10.000000 | 1.100000 | 1.083858e+04 | 8.540700e+02 | 3.008780e+03 | 0.000000e+00 | 5.088640e+03 | 2.849420e+03 | 1.274700e+02 | 0.000000 | 2015.000000 |
50% | 24.000000 | 1.370000 | 1.073768e+05 | 8.645300e+03 | 2.906102e+04 | 1.849900e+02 | 3.974383e+04 | 2.636282e+04 | 2.647710e+03 | 0.000000 | 2016.000000 |
75% | 38.000000 | 1.660000 | 4.329623e+05 | 1.110202e+05 | 1.502069e+05 | 6.243420e+03 | 1.107834e+05 | 8.333767e+04 | 2.202925e+04 | 132.500000 | 2017.000000 |
max | 52.000000 | 3.250000 | 6.250565e+07 | 2.274362e+07 | 2.047057e+07 | 2.546439e+06 | 1.937313e+07 | 1.338459e+07 | 5.719097e+06 | 551693.650000 | 2018.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
avocados.hist(bins=50, figsize=(20,15))
plt.show()
I used a simple shuffled data set with test and train ratios splitting the data set into two.
#will change everytime it is ran
#eventually showing the model the entire dataset
import numpy as np
def split_train_test(data, test_ratio):
shuffled_indices = np.random.permutation(len(data))
test_set_size = int(len(data) * test_ratio)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]
return data.iloc[train_indices], data.iloc[test_indices]
from zlib import crc32
def test_set_check(identifier, test_ratio):
return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio, id_column):
ids = data[id_column]
in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
return data.loc[~in_test_set], data.loc[in_test_set]
avocados_with_id = avocados.reset_index() # adds an 'index column'
train_set, test_set = split_train_test_by_id(avocados_with_id, 0.2, "index")
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)
#Checking if the sets were split evenly
train_set["type"].value_counts()
conventional 7306 organic 7293 Name: type, dtype: int64
test_set["type"].value_counts()
organic 1830 conventional 1820 Name: type, dtype: int64
Total Volume is the sum of 4225, 4770, and 4046. I created attributes to show the percentage of the total volume for each of the PLU attributes. The new 4046_ratio attribute turned out to be the number one correlated attribute with AveragePrice.
corr_matrix = avocados.corr()
corr_matrix["AveragePrice"].sort_values(ascending=False)
AveragePrice 1.000000 year 0.093197 XLarge Bags -0.117592 Unnamed: 0 -0.133008 4225 -0.172928 Large Bags -0.172940 Small Bags -0.174730 Total Bags -0.177088 4770 -0.179446 Total Volume -0.192752 4046 -0.208317 Name: AveragePrice, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["AveragePrice", "4046", "Total Volume", "4770"]
scatter_matrix(avocados[attributes], figsize=(12,8))
array([[<AxesSubplot:xlabel='AveragePrice', ylabel='AveragePrice'>, <AxesSubplot:xlabel='4046', ylabel='AveragePrice'>, <AxesSubplot:xlabel='Total Volume', ylabel='AveragePrice'>, <AxesSubplot:xlabel='4770', ylabel='AveragePrice'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='4046'>, <AxesSubplot:xlabel='4046', ylabel='4046'>, <AxesSubplot:xlabel='Total Volume', ylabel='4046'>, <AxesSubplot:xlabel='4770', ylabel='4046'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='Total Volume'>, <AxesSubplot:xlabel='4046', ylabel='Total Volume'>, <AxesSubplot:xlabel='Total Volume', ylabel='Total Volume'>, <AxesSubplot:xlabel='4770', ylabel='Total Volume'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='4770'>, <AxesSubplot:xlabel='4046', ylabel='4770'>, <AxesSubplot:xlabel='Total Volume', ylabel='4770'>, <AxesSubplot:xlabel='4770', ylabel='4770'>]], dtype=object)
avocados.plot(kind="scatter", x="Total Volume", y="AveragePrice",
alpha=0.1)
<AxesSubplot:xlabel='Total Volume', ylabel='AveragePrice'>
avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]
avocados["4046_ratio"] = avocados["4046"]/avocados["Total Volume"]
avocados["4225_ratio"] = avocados["4225"]/avocados["Total Volume"]
avocados["4770_ratio"] = avocados["4770"]/avocados["Total Volume"]
avocados.describe()
Unnamed: 0 | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 18249.000000 | 18249.000000 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 18249.000000 | 18249.000000 | 1.824900e+04 | 18249.000000 | 18249.000000 | 18249.000000 |
mean | 24.232232 | 1.405978 | 8.506440e+05 | 2.930084e+05 | 2.951546e+05 | 2.283974e+04 | 2.396392e+05 | 1.821947e+05 | 5.433809e+04 | 3106.426507 | 2016.147899 | inf | 0.225185 | 0.347391 | 0.019313 |
std | 15.481045 | 0.402677 | 3.453545e+06 | 1.264989e+06 | 1.204120e+06 | 1.074641e+05 | 9.862424e+05 | 7.461785e+05 | 2.439660e+05 | 17692.894652 | 0.939938 | NaN | 0.213921 | 0.234912 | 0.041866 |
min | 0.000000 | 0.440000 | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 | 1.001824e+00 | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 1.100000 | 1.083858e+04 | 8.540700e+02 | 3.008780e+03 | 0.000000e+00 | 5.088640e+03 | 2.849420e+03 | 1.274700e+02 | 0.000000 | 2015.000000 | 1.788166e+00 | 0.032779 | 0.157279 | 0.000000 |
50% | 24.000000 | 1.370000 | 1.073768e+05 | 8.645300e+03 | 2.906102e+04 | 1.849900e+02 | 3.974383e+04 | 2.636282e+04 | 2.647710e+03 | 0.000000 | 2016.000000 | 2.832644e+00 | 0.157837 | 0.316862 | 0.001516 |
75% | 38.000000 | 1.660000 | 4.329623e+05 | 1.110202e+05 | 1.502069e+05 | 6.243420e+03 | 1.107834e+05 | 8.333767e+04 | 2.202925e+04 | 132.500000 | 2017.000000 | 4.368265e+00 | 0.378737 | 0.517175 | 0.019184 |
max | 52.000000 | 3.250000 | 6.250565e+07 | 2.274362e+07 | 2.047057e+07 | 2.546439e+06 | 1.937313e+07 | 1.338459e+07 | 5.719097e+06 | 551693.650000 | 2018.000000 | inf | 0.972523 | 0.992628 | 0.448719 |
corr_matrix = avocados.corr()
corr_matrix["AveragePrice"].sort_values(ascending=False)
AveragePrice 1.000000 4225_ratio 0.156095 year 0.093197 volume_per_bag 0.043009 XLarge Bags -0.117592 4770_ratio -0.126823 Unnamed: 0 -0.133008 4225 -0.172928 Large Bags -0.172940 Small Bags -0.174730 Total Bags -0.177088 4770 -0.179446 Total Volume -0.192752 4046 -0.208317 4046_ratio -0.341860 Name: AveragePrice, dtype: float64
attributes = ["AveragePrice", "4046_ratio", "4770_ratio", "4225_ratio"]
scatter_matrix(avocados[attributes], figsize=(12,8))
array([[<AxesSubplot:xlabel='AveragePrice', ylabel='AveragePrice'>, <AxesSubplot:xlabel='4046_ratio', ylabel='AveragePrice'>, <AxesSubplot:xlabel='4770_ratio', ylabel='AveragePrice'>, <AxesSubplot:xlabel='4225_ratio', ylabel='AveragePrice'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='4046_ratio'>, <AxesSubplot:xlabel='4046_ratio', ylabel='4046_ratio'>, <AxesSubplot:xlabel='4770_ratio', ylabel='4046_ratio'>, <AxesSubplot:xlabel='4225_ratio', ylabel='4046_ratio'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='4770_ratio'>, <AxesSubplot:xlabel='4046_ratio', ylabel='4770_ratio'>, <AxesSubplot:xlabel='4770_ratio', ylabel='4770_ratio'>, <AxesSubplot:xlabel='4225_ratio', ylabel='4770_ratio'>], [<AxesSubplot:xlabel='AveragePrice', ylabel='4225_ratio'>, <AxesSubplot:xlabel='4046_ratio', ylabel='4225_ratio'>, <AxesSubplot:xlabel='4770_ratio', ylabel='4225_ratio'>, <AxesSubplot:xlabel='4225_ratio', ylabel='4225_ratio'>]], dtype=object)
avocados.plot(kind="scatter", x="volume_per_bag", y="AveragePrice",
alpha=0.1)
<AxesSubplot:xlabel='volume_per_bag', ylabel='AveragePrice'>
Using the mean value of each attribute, the missing and null values are replaced.
avocados["volume_per_bag"].max()
inf
avocados.describe()
Unnamed: 0 | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 18249.000000 | 18249.000000 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 18249.000000 | 18249.000000 | 1.824900e+04 | 18249.000000 | 18249.000000 | 18249.000000 |
mean | 24.232232 | 1.405978 | 8.506440e+05 | 2.930084e+05 | 2.951546e+05 | 2.283974e+04 | 2.396392e+05 | 1.821947e+05 | 5.433809e+04 | 3106.426507 | 2016.147899 | inf | 0.225185 | 0.347391 | 0.019313 |
std | 15.481045 | 0.402677 | 3.453545e+06 | 1.264989e+06 | 1.204120e+06 | 1.074641e+05 | 9.862424e+05 | 7.461785e+05 | 2.439660e+05 | 17692.894652 | 0.939938 | NaN | 0.213921 | 0.234912 | 0.041866 |
min | 0.000000 | 0.440000 | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 | 1.001824e+00 | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 1.100000 | 1.083858e+04 | 8.540700e+02 | 3.008780e+03 | 0.000000e+00 | 5.088640e+03 | 2.849420e+03 | 1.274700e+02 | 0.000000 | 2015.000000 | 1.788166e+00 | 0.032779 | 0.157279 | 0.000000 |
50% | 24.000000 | 1.370000 | 1.073768e+05 | 8.645300e+03 | 2.906102e+04 | 1.849900e+02 | 3.974383e+04 | 2.636282e+04 | 2.647710e+03 | 0.000000 | 2016.000000 | 2.832644e+00 | 0.157837 | 0.316862 | 0.001516 |
75% | 38.000000 | 1.660000 | 4.329623e+05 | 1.110202e+05 | 1.502069e+05 | 6.243420e+03 | 1.107834e+05 | 8.333767e+04 | 2.202925e+04 | 132.500000 | 2017.000000 | 4.368265e+00 | 0.378737 | 0.517175 | 0.019184 |
max | 52.000000 | 3.250000 | 6.250565e+07 | 2.274362e+07 | 2.047057e+07 | 2.546439e+06 | 1.937313e+07 | 1.338459e+07 | 5.719097e+06 | 551693.650000 | 2018.000000 | inf | 0.972523 | 0.992628 | 0.448719 |
avocados["Total Bags"].count()
18249
avocados["Total Volume"].max()
62505646.52
###fixing Total Bags having 0 for a value
def indexes_for_value(data, value):
indexes = []
for i in range(len(data)):
if data[i]==value:
indexes.append(i)
return indexes
def replaceValueMean(data, value):
indexes = indexes_for_value(data,value)
total=0
count=len(data)-len(indexes)
for i in range(len(data)):
if data[i]!=value:
total+=data[i]
mean=total/count
for i in indexes:
data[i] = mean
return data
indexes_for_value(avocados["Total Bags"], 0)
[9212, 11024, 11320, 11321, 11333, 11347, 11348, 11387, 11388, 11594, 11595, 11596, 11597, 11614, 11662]
replaceValueMean(avocados["Total Bags"], 0)
<ipython-input-28-caa511a548a3>:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data[i] = mean
0 8696.87 1 9505.56 2 8145.35 3 5811.16 4 6183.95 ... 18244 13498.67 18245 9264.84 18246 9394.11 18247 10969.54 18248 12014.15 Name: Total Bags, Length: 18249, dtype: float64
avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]
avocados.describe()
Unnamed: 0 | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 18249.000000 | 18249.000000 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 18249.000000 | 18249.000000 | 18249.000000 | 18249.000000 | 18249.000000 | 18249.000000 |
mean | 24.232232 | 1.405978 | 8.506440e+05 | 2.930084e+05 | 2.951546e+05 | 2.283974e+04 | 2.398363e+05 | 1.821947e+05 | 5.433809e+04 | 3106.426507 | 2016.147899 | 8.406657 | 0.225185 | 0.347391 | 0.019313 |
std | 15.481045 | 0.402677 | 3.453545e+06 | 1.264989e+06 | 1.204120e+06 | 1.074641e+05 | 9.862184e+05 | 7.461785e+05 | 2.439660e+05 | 17692.894652 | 0.939938 | 116.793082 | 0.213921 | 0.234912 | 0.041866 |
min | 0.000000 | 0.440000 | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.090000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 | 0.011618 | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 1.100000 | 1.083858e+04 | 8.540700e+02 | 3.008780e+03 | 0.000000e+00 | 5.108010e+03 | 2.849420e+03 | 1.274700e+02 | 0.000000 | 2015.000000 | 1.784211 | 0.032779 | 0.157279 | 0.000000 |
50% | 24.000000 | 1.370000 | 1.073768e+05 | 8.645300e+03 | 2.906102e+04 | 1.849900e+02 | 3.992521e+04 | 2.636282e+04 | 2.647710e+03 | 0.000000 | 2016.000000 | 2.828519 | 0.157837 | 0.316862 | 0.001516 |
75% | 38.000000 | 1.660000 | 4.329623e+05 | 1.110202e+05 | 1.502069e+05 | 6.243420e+03 | 1.112384e+05 | 8.333767e+04 | 2.202925e+04 | 132.500000 | 2017.000000 | 4.361341 | 0.378737 | 0.517175 | 0.019184 |
max | 52.000000 | 3.250000 | 6.250565e+07 | 2.274362e+07 | 2.047057e+07 | 2.546439e+06 | 1.937313e+07 | 1.338459e+07 | 5.719097e+06 | 551693.650000 | 2018.000000 | 7708.260450 | 0.972523 | 0.992628 | 0.448719 |
train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)
avocados = train_set.copy()
avocados_labels = train_set["AveragePrice"].copy()
avocados = train_set.drop("AveragePrice", axis=1)
#substitute for missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
avocados = avocados.drop("Unnamed: 0", axis=1)
avocados.describe()
Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 |
mean | 8.486369e+05 | 2.934236e+05 | 2.943944e+05 | 2.277926e+04 | 2.382348e+05 | 1.808815e+05 | 5.403939e+04 | 3116.713649 | 2016.148777 | 8.733382 | 0.225752 | 0.345798 | 0.019209 |
std | 3.404709e+06 | 1.243118e+06 | 1.195564e+06 | 1.076246e+05 | 9.670298e+05 | 7.305218e+05 | 2.401997e+05 | 17607.337329 | 0.940367 | 127.947791 | 0.214598 | 0.234828 | 0.041561 |
min | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.090000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 | 0.011618 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.086533e+04 | 8.547250e+02 | 2.997545e+03 | 0.000000e+00 | 5.160235e+03 | 2.884600e+03 | 1.236350e+02 | 0.000000 | 2015.000000 | 1.772124 | 0.032368 | 0.156055 | 0.000000 |
50% | 1.091119e+05 | 8.762480e+03 | 2.971125e+04 | 1.905800e+02 | 4.041798e+04 | 2.655889e+04 | 2.693840e+03 | 0.000000 | 2016.000000 | 2.825934 | 0.158194 | 0.313863 | 0.001578 |
75% | 4.354375e+05 | 1.113795e+05 | 1.507282e+05 | 6.231185e+03 | 1.121544e+05 | 8.370342e+04 | 2.242839e+04 | 133.925000 | 2017.000000 | 4.358727 | 0.379883 | 0.514511 | 0.019113 |
max | 5.228870e+07 | 1.778761e+07 | 2.047057e+07 | 2.546439e+06 | 1.639452e+07 | 1.254033e+07 | 4.324231e+06 | 551693.650000 | 2018.000000 | 7708.260450 | 0.972523 | 0.992628 | 0.448719 |
avocados_num = avocados.drop(["Date", "type", "year", "region"], axis=1)
imputer.fit(avocados_num)
SimpleImputer(strategy='median')
imputer.statistics_
array([1.09111930e+05, 8.76248000e+03, 2.97112500e+04, 1.90580000e+02, 4.04179800e+04, 2.65588900e+04, 2.69384000e+03, 0.00000000e+00, 2.82593365e+00, 1.58193823e-01, 3.13863273e-01, 1.57822821e-03])
avocados_num.median().values
array([1.09111930e+05, 8.76248000e+03, 2.97112500e+04, 1.90580000e+02, 4.04179800e+04, 2.65588900e+04, 2.69384000e+03, 0.00000000e+00, 2.82593365e+00, 1.58193823e-01, 3.13863273e-01, 1.57822821e-03])
X = imputer.transform(avocados_num)
avocados_tr = pd.DataFrame(X, columns=avocados_num.columns)
Changed date to month to get a more accurate representation of which season the purchase was in.
def toMonth(dates):
Month= []
for date in dates:
Month.append(int(date[5:7]))
return pd.DataFrame(Month, columns=["Month"])
avocados["Month"]=toMonth(avocados["Date"])
avocados.describe()
Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 11652.000000 |
mean | 8.486369e+05 | 2.934236e+05 | 2.943944e+05 | 2.277926e+04 | 2.382348e+05 | 1.808815e+05 | 5.403939e+04 | 3116.713649 | 2016.148777 | 8.733382 | 0.225752 | 0.345798 | 0.019209 | 6.217216 |
std | 3.404709e+06 | 1.243118e+06 | 1.195564e+06 | 1.076246e+05 | 9.670298e+05 | 7.305218e+05 | 2.401997e+05 | 17607.337329 | 0.940367 | 127.947791 | 0.214598 | 0.234828 | 0.041561 | 3.540092 |
min | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.090000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 | 0.011618 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 1.086533e+04 | 8.547250e+02 | 2.997545e+03 | 0.000000e+00 | 5.160235e+03 | 2.884600e+03 | 1.236350e+02 | 0.000000 | 2015.000000 | 1.772124 | 0.032368 | 0.156055 | 0.000000 | 3.000000 |
50% | 1.091119e+05 | 8.762480e+03 | 2.971125e+04 | 1.905800e+02 | 4.041798e+04 | 2.655889e+04 | 2.693840e+03 | 0.000000 | 2016.000000 | 2.825934 | 0.158194 | 0.313863 | 0.001578 | 6.000000 |
75% | 4.354375e+05 | 1.113795e+05 | 1.507282e+05 | 6.231185e+03 | 1.121544e+05 | 8.370342e+04 | 2.242839e+04 | 133.925000 | 2017.000000 | 4.358727 | 0.379883 | 0.514511 | 0.019113 | 9.000000 |
max | 5.228870e+07 | 1.778761e+07 | 2.047057e+07 | 2.546439e+06 | 1.639452e+07 | 1.254033e+07 | 4.324231e+06 | 551693.650000 | 2018.000000 | 7708.260450 | 0.972523 | 0.992628 | 0.448719 | 12.000000 |
I will be using the OneHotEncoder to encode the categories.
from sklearn.preprocessing import OneHotEncoder
#sparse=False to make it a dense Matrix
type_encoder = OneHotEncoder()
avocados_type_reshaped = avocados["type"].values.reshape(-1,1)
avocados_type_1hot = type_encoder.fit_transform(avocados_type_reshaped)
avocados_type_1hot
<14599x2 sparse matrix of type '<class 'numpy.float64'>' with 14599 stored elements in Compressed Sparse Row format>
type_encoder.categories_
[array(['conventional', 'organic'], dtype=object)]
region_encoder = OneHotEncoder()
avocados_region_reshaped = avocados["region"].values.reshape(-1,1)
avocados_region_1hot = region_encoder.fit_transform(avocados_region_reshaped)
avocados_region_1hot
<14599x54 sparse matrix of type '<class 'numpy.float64'>' with 14599 stored elements in Compressed Sparse Row format>
region_encoder.categories_
[array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston', 'BuffaloRochester', 'California', 'Charlotte', 'Chicago', 'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver', 'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton', 'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville', 'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale', 'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork', 'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia', 'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland', 'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento', 'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina', 'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse', 'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object)]
from sklearn.base import BaseEstimator, TransformerMixin
#indexes
volume_ix, ix_4046, ix_4225, ix_4770, bags_ix = 1, 2, 3, 4, 5
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_volume_per_bag = True):
self.add_volume_per_bag = add_volume_per_bag
def fit(self, X, y=None):
return self #nothing else to do
def transform(self, X, y=None):
ratio_4046 = X[:, ix_4046] / X[:, volume_ix]
ratio_4225 = X[:, ix_4225] / X[:, volume_ix]
ratio_4770 = X[:, ix_4770] / X[:, volume_ix]
if self.add_volume_per_bag:
volume_per_bag = X[:, volume_ix] / replaceValueMean(X[:, bags_ix], 0)
return np.c_[X,ratio_4046, ratio_4225, ratio_4770, volume_per_bag]
else:
return np.c_[X,ratio_4046, ratio_4225, ratio_4770]
attr_adder = CombinedAttributesAdder(add_volume_per_bag=False)
avocados_extra_attribs = attr_adder.transform(avocados.values)
avocados_num.describe()
Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | volume_per_bag | 4046_ratio | 4225_ratio | 4770_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 1.459900e+04 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 | 14599.000000 |
mean | 8.486369e+05 | 2.934236e+05 | 2.943944e+05 | 2.277926e+04 | 2.382348e+05 | 1.808815e+05 | 5.403939e+04 | 3116.713649 | 8.733382 | 0.225752 | 0.345798 | 0.019209 |
std | 3.404709e+06 | 1.243118e+06 | 1.195564e+06 | 1.076246e+05 | 9.670298e+05 | 7.305218e+05 | 2.401997e+05 | 17607.337329 | 127.947791 | 0.214598 | 0.234828 | 0.041561 |
min | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.090000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.011618 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.086533e+04 | 8.547250e+02 | 2.997545e+03 | 0.000000e+00 | 5.160235e+03 | 2.884600e+03 | 1.236350e+02 | 0.000000 | 1.772124 | 0.032368 | 0.156055 | 0.000000 |
50% | 1.091119e+05 | 8.762480e+03 | 2.971125e+04 | 1.905800e+02 | 4.041798e+04 | 2.655889e+04 | 2.693840e+03 | 0.000000 | 2.825934 | 0.158194 | 0.313863 | 0.001578 |
75% | 4.354375e+05 | 1.113795e+05 | 1.507282e+05 | 6.231185e+03 | 1.121544e+05 | 8.370342e+04 | 2.242839e+04 | 133.925000 | 4.358727 | 0.379883 | 0.514511 | 0.019113 |
max | 5.228870e+07 | 1.778761e+07 | 2.047057e+07 | 2.546439e+06 | 1.639452e+07 | 1.254033e+07 | 4.324231e+06 | 551693.650000 | 7708.260450 | 0.972523 | 0.992628 | 0.448719 |
I scaled all the values with sklearn's StandardScaler.
I also created a num and cat pipeline along with a pipeline combining the two.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
('attribs_adder', CombinedAttributesAdder()),
])
avocados_num_tr = num_pipeline.fit_transform(avocados_num)
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attribute_names):
self.attribute_names = attribute_names
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.attribute_names].values
num_attribs = list(avocados_num)
cat_attribs = ["type", "region"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', SimpleImputer(strategy="median")),
('std_scaler0', StandardScaler()),
('attribs_adder', CombinedAttributesAdder(add_volume_per_bag=False)),
('std_scaler1', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', OneHotEncoder()),
])
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list = [
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
avocados_prepared = full_pipeline.fit_transform(avocados)
avocados_prepared
<14599x71 sparse matrix of type '<class 'numpy.float64'>' with 248183 stored elements in Compressed Sparse Row format>
avocados_prepared.shape
(14599, 71)
The random forest regression model had the lowest rmse, so we will be moving forward with that model.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(avocados_prepared, avocados_labels)
LinearRegression()
some_data = avocados.iloc[:5]
some_labels = avocados_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))
Predictions: [1.72196892 1.08580474 1.51202689 0.99730759 1.87637844] Labels: [1.88, 0.81, 2.01, 1.02, 1.53]
from sklearn.metrics import mean_squared_error
avocados_predictions = lin_reg.predict(avocados_prepared)
lin_mse = mean_squared_error(avocados_labels, avocados_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
0.2650199722094959
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(avocados_prepared, avocados_labels)
DecisionTreeRegressor()
avocados_predictions = tree_reg.predict(avocados_prepared)
tree_mse = mean_squared_error(avocados_labels, avocados_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
1.1767131303343808e-17
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, avocados_prepared, avocados_labels,
scoring = "neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
display_scores(tree_rmse_scores)
Scores: [0.24568788 0.23994349 0.23141805 0.25191581 0.2435969 0.23373017 0.2424422 0.24040975 0.24400469 0.24816061] Mean: 0.24213095548650268 Standard deviation: 0.005861702130644642
lin_scores = cross_val_score(lin_reg, avocados_prepared, avocados_labels,
scoring = "neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)
Scores: [0.24568788 0.23994349 0.23141805 0.25191581 0.2435969 0.23373017 0.2424422 0.24040975 0.24400469 0.24816061] Mean: 0.24213095548650268 Standard deviation: 0.005861702130644642
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(avocados_prepared, avocados_labels)
RandomForestRegressor()
avocados_predictions = forest_reg.predict(avocados_prepared)
forest_mse = mean_squared_error(avocados_labels, avocados_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
0.06255144638968366
forest_scores = cross_val_score(forest_reg, avocados_prepared, avocados_labels,
scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
Scores: [0.16596757 0.16742614 0.1704742 0.1704822 0.17054483 0.16186383 0.17120198 0.17965895 0.17411243 0.17798801] Mean: 0.17097201474640739 Standard deviation: 0.005058151190141573
from sklearn.svm import SVR
SVR_reg = SVR()
SVR_reg.fit(avocados_prepared, avocados_labels)
SVR()
avocados_predictions = SVR_reg.predict(avocados_prepared)
SVR_mse = mean_squared_error(avocados_labels, avocados_predictions)
SVR_rmse = np.sqrt(SVR_mse)
SVR_rmse
0.21843106358993253
SVR_scores = cross_val_score(SVR_reg, avocados_prepared, avocados_labels,
scoring="neg_mean_squared_error", cv=10)
SVR_rmse_scores = np.sqrt(-SVR_scores)
display_scores(SVR_rmse_scores)
Scores: [0.22171831 0.2261141 0.22090493 0.21825387 0.22662064 0.21606678 0.2329331 0.23183207 0.22789418 0.22293031] Mean: 0.22452682879791705 Standard deviation: 0.005260720932364574
from sklearn.model_selection import GridSearchCV
param_grid = [
{'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
{'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error')
grid_search.fit(avocados_prepared, avocados_labels)
GridSearchCV(cv=5, estimator=RandomForestRegressor(), param_grid=[{'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]}, {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}], scoring='neg_mean_squared_error')
grid_search.best_params_
{'max_features': 2, 'n_estimators': 30}
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
0.2116363942459375 {'max_features': 2, 'n_estimators': 3} 0.18121439333957437 {'max_features': 2, 'n_estimators': 10} 0.1728490827684643 {'max_features': 2, 'n_estimators': 30} 0.2129284130657655 {'max_features': 4, 'n_estimators': 3} 0.18283133410679514 {'max_features': 4, 'n_estimators': 10} 0.1745044427561634 {'max_features': 4, 'n_estimators': 30} 0.21046989651130088 {'max_features': 6, 'n_estimators': 3} 0.1856221001647122 {'max_features': 6, 'n_estimators': 10} 0.1758508601890044 {'max_features': 6, 'n_estimators': 30} 0.21468792020110378 {'max_features': 8, 'n_estimators': 3} 0.18266417807260465 {'max_features': 8, 'n_estimators': 10} 0.17597667701609293 {'max_features': 8, 'n_estimators': 30} 0.2004828883916928 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3} 0.1732568969539697 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10} 0.1971529452298 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3} 0.17499132060465458 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10} 0.2002905008860829 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3} 0.17498023087381942 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([0.06970893, 0.05294806, 0.06390707, 0.05290015, 0.05963685, 0.06443708, 0.06422035, 0.03020833, 0.03681935, 0.04057721, 0.04708022, 0.0416463 , 0.0648686 , 0.06912735, 0.03794594, 0.04115925, 0.05027523, 0.00077781, 0.00105933, 0.001451 , 0.00142611, 0.00136066, 0.00103711, 0.00194799, 0.00107922, 0.00150251, 0.00290214, 0.00217187, 0.00378784, 0.00162447, 0.00145296, 0.00177337, 0.00052173, 0.00133792, 0.00657373, 0.00323851, 0.00215259, 0.00140908, 0.00198641, 0.00133939, 0.00235662, 0.00124424, 0.0007067 , 0.00306852, 0.00087995, 0.00569818, 0.00439279, 0.00094297, 0.00157611, 0.00252846, 0.0030411 , 0.00147902, 0.00128897, 0.00189814, 0.00161016, 0.00283592, 0.00212788, 0.00233279, 0.00159348, 0.00957793, 0.00218824, 0.00073087, 0.0022978 , 0.00118673, 0.00224612, 0.0014772 , 0.00055826, 0.00072755, 0.0012682 , 0.00218302, 0.00257604])
extra_attribs = ["ratio_4046", "ratio_4225", "ratio_4770"]
cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse =True)
[(0.06970892635486484, 'Total Volume'), (0.06912734626746812, 'organic'), (0.06486860263608833, 'conventional'), (0.0644370847531914, 'Small Bags'), (0.0642203465505144, 'Large Bags'), (0.0639070679721322, '4225'), (0.05963684904651195, 'Total Bags'), (0.05294806173116172, '4046'), (0.05290015336792799, '4770'), (0.04708021940211616, '4225_ratio'), (0.04164629930207952, '4770_ratio'), (0.040577214856152885, '4046_ratio'), (0.03681934627802098, 'volume_per_bag'), (0.030208334402390943, 'XLarge Bags')]
final_model = grid_search.best_estimator_
X_test = test_set.drop("AveragePrice", axis=1)
y_test = test_set["AveragePrice"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
0.16731021443379146
The Random Forest Regression Model worked best with this dataset. The main factors (in respective order) that go into the price of an avocado are:
All conclusions made from this report rely on the fact that this database is representative of all large orders of avocados. If this is not true, all conclusions should be taken lightly.