1. Load the data-set¶

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing 
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt

We need to lead the perfume dataset from sklearn

perfume_preference = pd.read_csv("Perfume preference.csv")
perfume_preference

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver
0	1353.0	1252.0	4066.0	3838.0	2144.0	4404.0	32082.0	3866.0	2505.0	3972.0	4485.0	6441.0	4106.0	1722.0	4287.0	4820.0	4140.0	1463.0
1	1089.0	2152.0	4045.0	3710.0	2235.0	4352.0	30398.0	4769.0	2995.0	4720.0	4532.0	10931.0	3794.0	1638.0	4648.0	4472.0	4184.0	1071.0
2	4177.0	3592.0	3596.0	1745.0	3234.0	2116.0	21678.0	4864.0	3178.0	3381.0	1376.0	18153.0	2502.0	1733.0	1747.0	2728.0	4580.0	4742.0
3	4899.0	3738.0	2454.0	3976.0	4945.0	3853.0	17963.0	3040.0	2943.0	2870.0	4016.0	18819.0	1990.0	5118.0	2391.0	2012.0	3470.0	3057.0
4	4822.0	4030.0	3447.0	4225.0	4078.0	3772.0	23988.0	3389.0	2415.0	2695.0	3887.0	20367.0	2118.0	4530.0	2427.0	3205.0	4319.0	2289.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9492	4857.0	3654.0	2788.0	3830.0	4272.0	4349.0	20816.0	2524.0	2675.0	2550.0	4620.0	18506.0	1999.0	5329.0	2140.0	2568.0	4358.0	2361.0
9493	2040.0	2561.0	3913.0	3375.0	2058.0	4722.0	19399.0	4765.0	3796.0	2949.0	2686.0	13008.0	4201.0	1830.0	1534.0	2272.0	3348.0	2992.0
9494	4846.0	4883.0	4153.0	2108.0	4164.0	1881.0	20551.0	5030.0	2683.0	4001.0	1450.0	24684.0	3979.0	1187.0	2107.0	2508.0	4581.0	4731.0
9495	4310.0	3916.0	3937.0	2488.0	3343.0	2219.0	22914.0	5104.0	2640.0	3864.0	1730.0	19874.0	3654.0	499.0	1920.0	2971.0	4476.0	4654.0
9496	2698.0	3174.0	3984.0	3541.0	2522.0	4946.0	18512.0	5165.0	4167.0	2704.0	2536.0	16069.0	3851.0	1620.0	1830.0	2084.0	3240.0	3644.0

9497 rows × 18 columns

perfume_score = pd.read_csv("Perfume Score.csv")
perfume_score

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver	Scent Quality Score
0	489.766	343.510	638.519	315.377	966.417	913.256	1015.036	479.027	485.797	2918.050062	108.538	727.438	936.842	4801.306119	261.952	148.593	783.264	809.541	1.302700e+07
1	472.841	218.288	642.332	210.582	995.068	989.447	958.614	507.113	242.015	2119.074840	246.654	755.477	840.936	4896.315590	149.498	44.490	906.204	815.512	1.159073e+07
2	472.620	323.480	696.770	288.379	1006.334	875.163	987.398	611.463	410.451	2679.139347	281.022	729.155	825.386	5350.521973	177.980	141.612	705.294	794.394	1.367693e+07
3	503.155	397.632	644.533	151.414	960.097	905.462	1031.227	469.357	388.405	1784.035393	280.953	711.906	786.198	5029.939322	29.515	149.231	678.681	837.614	7.997427e+06
4	499.780	344.096	643.764	353.518	1033.988	978.976	871.312	439.266	311.002	3236.214279	272.058	737.003	898.238	4988.788504	138.884	122.238	622.090	824.174	1.113290e+07
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4999	449.162	353.896	680.031	220.188	940.716	851.543	951.874	600.726	383.718	2209.955448	122.345	852.887	973.806	5122.835105	50.474	87.481	662.834	821.282	1.072665e+07
5000	526.781	392.868	652.819	268.901	983.403	700.787	1031.042	583.384	414.174	2453.815268	368.325	770.897	825.038	5009.288848	195.544	83.047	819.217	830.439	1.426609e+07
5001	475.160	256.740	655.360	204.422	905.181	1055.073	1008.550	539.192	399.411	2007.515839	192.985	774.179	747.784	4925.275302	205.319	143.601	741.248	780.727	9.882660e+06
5002	481.422	278.652	647.467	147.307	1033.814	880.379	1053.847	510.981	410.661	1762.999938	144.866	802.051	890.813	4992.597380	116.158	80.665	804.591	792.583	9.200338e+06
5003	476.130	364.371	659.429	224.550	1207.776	837.054	882.858	625.714	361.077	2175.815594	231.943	621.123	737.455	4939.983539	231.950	178.103	710.752	867.652	1.195802e+07

5004 rows × 19 columns

2. Review the data quatitatively¶

perfume_score.describe()

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver	Scent Quality Score
count	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5004.000000	5.004000e+03
mean	470.922058	346.112136	674.849797	209.349424	988.363780	888.163961	944.930020	564.445091	378.302281	2186.942959	250.475366	769.066442	866.283868	5172.149509	121.493370	118.018880	703.506534	802.327559	1.079593e+07
std	23.038942	59.788428	36.724524	80.932744	74.334501	84.265546	70.080494	72.778522	60.988463	575.210895	76.697327	89.079681	87.392412	275.760510	70.881267	62.237022	99.694353	25.963051	2.867554e+06
min	383.651000	121.396000	543.403000	0.000000	652.234000	539.166000	683.213000	287.286000	142.905000	487.811886	0.000000	457.725000	545.930000	4119.640577	0.000000	0.000000	317.364000	714.678000	3.860472e+06
25%	455.292000	305.453750	650.423250	154.964250	938.166000	830.146500	896.683000	514.985750	337.244250	1804.401369	198.506500	707.927750	809.248500	4990.344908	69.145750	74.265250	635.601500	784.980750	8.777066e+06
50%	470.695500	346.160500	675.707000	209.278000	989.786500	889.092500	945.116500	563.099000	378.364000	2191.656699	249.945000	770.214000	867.822500	5174.842876	118.840500	117.010500	703.989000	802.968000	1.057777e+07
75%	486.797750	386.261000	699.350250	262.900250	1038.281500	945.673750	993.838500	612.617000	418.213250	2571.954572	301.876250	830.076000	924.809000	5358.193002	170.583000	160.389500	769.465500	819.953500	1.255241e+07
max	548.708000	562.238000	817.061000	481.593000	1258.446000	1183.733000	1214.694000	825.775000	646.347000	4079.337285	535.614000	1116.747000	1195.179000	6231.998892	396.534000	348.029000	1049.738000	888.987000	2.203800e+07

perfume_preference.describe()

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver
count	9494.000000	9493.000000	9489.000000	9495.000000	9494.000000	9492.000000	9489.000000	9494.000000	9491.000000	9490.000000	9489.000000	9493.000000	9492.000000	9492.000000	9487.000000	9489.000000	9492.000000	9487.000000
mean	3265.382663	3121.755083	3763.242597	3106.079726	3005.079524	3821.780341	22736.683001	4312.222878	3113.745970	3426.057113	3139.691538	15808.726325	3217.781184	2446.029709	2479.632023	2947.475182	4202.323957	3008.485190
std	1390.405062	924.872149	552.936664	686.367481	991.974076	1059.493773	5232.947390	861.309113	661.345376	740.826433	1223.697310	4625.023111	785.173638	1342.850878	1191.003546	1046.691982	409.025964	1219.619389
min	515.000000	584.000000	1998.000000	1373.000000	946.000000	642.000000	13318.000000	1685.000000	1239.000000	1703.000000	551.000000	3087.000000	1237.000000	58.000000	30.000000	1057.000000	2256.000000	283.000000
25%	1962.000000	2334.000000	3383.000000	2538.500000	2119.000000	3176.000000	18746.000000	3642.000000	2628.000000	2728.000000	2071.000000	11880.000000	2545.750000	1581.000000	1620.000000	2149.000000	3935.750000	2093.000000
50%	2911.500000	3372.000000	3780.000000	3201.000000	2941.000000	4193.500000	20910.000000	4573.000000	2908.000000	3525.000000	2991.000000	17055.000000	3515.000000	1859.500000	2069.000000	2581.000000	4215.000000	2873.000000
75%	4598.000000	3913.000000	4142.000000	3616.000000	3764.000000	4569.000000	28136.000000	4953.000000	3716.000000	4036.000000	4293.000000	19765.000000	3773.000000	2775.250000	3965.500000	4037.000000	4486.000000	3863.000000
max	5761.000000	5119.000000	5811.000000	4936.000000	5798.000000	5826.000000	35793.000000	6136.000000	4814.000000	5267.000000	5998.000000	25873.000000	4879.000000	6348.000000	5061.000000	5562.000000	5547.000000	6072.000000

We should also review yje data to see if there are any missing values.

pd.isnull(perfume_score).any()

Narcissus              False
Agrumen                False
Oud                    False
Jasmine                False
Amber                  False
Neroli                 False
Indole                 False
Vanilla                False
Frankincense           False
Bergamot               False
Galbanum               False
Magnolia               False
Sandalwood             False
Cashmeran              False
Citron                 False
Opopanax               False
Aliphatic Aldehydes    False
Vetiver                False
Scent Quality Score    False
dtype: bool

pd.isnull(perfume_preference).any()

Narcissus              True
Agrumen                True
Oud                    True
Jasmine                True
Amber                  True
Neroli                 True
Indole                 True
Vanilla                True
Frankincense           True
Bergamot               True
Galbanum               True
Magnolia               True
Sandalwood             True
Cashmeran              True
Citron                 True
Opopanax               True
Aliphatic Aldehydes    True
Vetiver                True
dtype: bool

Turns out there's a flaw in the perfume preference data.

3. Clean and tidy the data¶

Display a count of missing data

print(perfume_preference.isnull().sum().sum())

Visualising missing data

import missingno as msno
perfume_preference_columns = perfume_preference.iloc[:,:]
msno.matrix(perfume_preference_columns)

<matplotlib.axes._subplots.AxesSubplot at 0x1a85e7c40c8>

png

Draw a bar-plot to indicate the amount of missingdata in each feature

msno.bar(perfume_preference_columns)

<matplotlib.axes._subplots.AxesSubplot at 0x1a85eb2b208>

png

Impute missing data using the mean of other data from the same feature

perfume_preference.fillna(perfume_preference.mean(), inplace = True) 
perfume_preference

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver
0	1353.0	1252.0	4066.0	3838.0	2144.0	4404.0	32082.0	3866.0	2505.0	3972.0	4485.0	6441.0	4106.0	1722.0	4287.0	4820.0	4140.0	1463.0
1	1089.0	2152.0	4045.0	3710.0	2235.0	4352.0	30398.0	4769.0	2995.0	4720.0	4532.0	10931.0	3794.0	1638.0	4648.0	4472.0	4184.0	1071.0
2	4177.0	3592.0	3596.0	1745.0	3234.0	2116.0	21678.0	4864.0	3178.0	3381.0	1376.0	18153.0	2502.0	1733.0	1747.0	2728.0	4580.0	4742.0
3	4899.0	3738.0	2454.0	3976.0	4945.0	3853.0	17963.0	3040.0	2943.0	2870.0	4016.0	18819.0	1990.0	5118.0	2391.0	2012.0	3470.0	3057.0
4	4822.0	4030.0	3447.0	4225.0	4078.0	3772.0	23988.0	3389.0	2415.0	2695.0	3887.0	20367.0	2118.0	4530.0	2427.0	3205.0	4319.0	2289.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9492	4857.0	3654.0	2788.0	3830.0	4272.0	4349.0	20816.0	2524.0	2675.0	2550.0	4620.0	18506.0	1999.0	5329.0	2140.0	2568.0	4358.0	2361.0
9493	2040.0	2561.0	3913.0	3375.0	2058.0	4722.0	19399.0	4765.0	3796.0	2949.0	2686.0	13008.0	4201.0	1830.0	1534.0	2272.0	3348.0	2992.0
9494	4846.0	4883.0	4153.0	2108.0	4164.0	1881.0	20551.0	5030.0	2683.0	4001.0	1450.0	24684.0	3979.0	1187.0	2107.0	2508.0	4581.0	4731.0
9495	4310.0	3916.0	3937.0	2488.0	3343.0	2219.0	22914.0	5104.0	2640.0	3864.0	1730.0	19874.0	3654.0	499.0	1920.0	2971.0	4476.0	4654.0
9496	2698.0	3174.0	3984.0	3541.0	2522.0	4946.0	18512.0	5165.0	4167.0	2704.0	2536.0	16069.0	3851.0	1620.0	1830.0	2084.0	3240.0	3644.0

9497 rows × 18 columns

Do a final check by re-counting the amount of missing data

print(perfume_preference.isnull().sum().sum())

4. Review the data visually¶

from pandas.plotting import scatter_matrix
scatter_matrix(perfume_preference, figsize = (12,12));

png

scatter_matrix(perfume_score, figsize = (12,12));

png

Correlation matrix:

correlation_matrix = np.absolute(perfume_preference.corr().round(2))
sns.set(rc = {'figure.figsize': (10,10)})
ax = sns.heatmap(correlation_matrix, annot = True, cmap = 'Reds')
bottom,top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

(18.5, -0.5)

png

From this correlation map, we see that: - Vetiver is strongly correlated with Neroli and Citron - Opopanax is strongly correlated with Bergamot - ...

correlation_matrix = np.absolute(perfume_score.corr().round(2))
sns.set(rc = {'figure.figsize': (10,10)})
ax = sns.heatmap(correlation_matrix, annot = True, cmap = 'Reds')
bottom,top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

(19.5, -0.5)

png

From this correlation map, we see that: - Scent Quality Score is strongly correlated with Bergamot and Jasmine - Scent Quality Score is also affected by Aliphatic Aldehydes and Vanilla to some extent - Jasmine and Bergamot are correlated with each other. We should not use both Jasmine and Vanillafor building the model considering the 'double counting'their impact on the result - Also Oud and Cashmeran are correlated.

Therefore, we choose Jasmine, Vanilla and Aliphatic Aldhydes for building our first model.

Three key features(Jasmine, Vanilla and Aliphatic Aldehydes) in a 3D plot:

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(perfume_score['Jasmine'], perfume_score['Vanilla'], perfume_score['Aliphatic Aldehydes'], c = perfume_score['Scent Quality Score'], cmap = 'gist_heat')

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1a86c1b2908>

png

5. Polynominal regression¶

5.1 Select and split the data - processing the perfume score data¶

X = pd.DataFrame(np.c_[perfume_score['Jasmine'], perfume_score['Vanilla'], perfume_score['Aliphatic Aldehydes']], columns = ['Jasmine','Vanilla','Aliphatic Aldehydes'])
Y = perfume_score['Scent Quality Score']
X

	Jasmine	Vanilla	Aliphatic Aldehydes
0	315.377	479.027	783.264
1	210.582	507.113	906.204
2	288.379	611.463	705.294
3	151.414	469.357	678.681
4	353.518	439.266	622.090
...	...	...	...
4999	220.188	600.726	662.834
5000	268.901	583.384	819.217
5001	204.422	539.192	741.248
5002	147.307	510.981	804.591
5003	224.550	625.714	710.752

5004 rows × 3 columns

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
print(X_train[0:10])

      Jasmine  Vanilla  Aliphatic Aldehydes
4427  170.793  570.382              908.473
3248  158.091  416.565              641.899
2286  203.866  600.023              642.780
581   232.697  436.631              720.934
1188   90.112  538.792              814.159
2535   17.924  585.713              618.380
2151  252.522  624.270              621.184
113   332.608  570.309              561.070
3801   80.332  487.969              710.184
3430  362.441  550.813              800.493

5.2 Build the model¶

We first generate the features and take a look at the extended set of feature names:

from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree = 3)
X_train_poly = poly_features.fit_transform(X_train)
print(poly_features.get_feature_names(["Jasmine","Vanilla","Aliphatic Aldehydes"]))

['1', 'Jasmine', 'Vanilla', 'Aliphatic Aldehydes', 'Jasmine^2', 'Jasmine Vanilla', 'Jasmine Aliphatic Aldehydes', 'Vanilla^2', 'Vanilla Aliphatic Aldehydes', 'Aliphatic Aldehydes^2', 'Jasmine^3', 'Jasmine^2 Vanilla', 'Jasmine^2 Aliphatic Aldehydes', 'Jasmine Vanilla^2', 'Jasmine Vanilla Aliphatic Aldehydes', 'Jasmine Aliphatic Aldehydes^2', 'Vanilla^3', 'Vanilla^2 Aliphatic Aldehydes', 'Vanilla Aliphatic Aldehydes^2', 'Aliphatic Aldehydes^3']

Actual data:

X_train_poly[0:10]

array([[1.00000000e+00, 1.70793000e+02, 5.70382000e+02, 9.08473000e+02,
        2.91702488e+04, 9.74172529e+04, 1.55160829e+05, 3.25335626e+05,
        5.18176647e+05, 8.25323192e+05, 4.98207431e+06, 1.66381849e+07,
        2.65003835e+07, 5.55650476e+07, 8.85009440e+07, 1.40959424e+08,
        1.85565585e+08, 2.95558632e+08, 4.70749493e+08, 7.49783836e+08],
       [1.00000000e+00, 1.58091000e+02, 4.16565000e+02, 6.41899000e+02,
        2.49927643e+04, 6.58551774e+04, 1.01478455e+05, 1.73526399e+05,
        2.67392657e+05, 4.12034326e+05, 3.95113110e+06, 1.04111109e+07,
        1.60428304e+07, 2.74329620e+07, 4.22723725e+07, 6.51389187e+07,
        7.22850245e+07, 1.11386422e+08, 1.71639079e+08, 2.64484422e+08],
       [1.00000000e+00, 2.03866000e+02, 6.00023000e+02, 6.42780000e+02,
        4.15613460e+04, 1.22324289e+05, 1.31040987e+05, 3.60027601e+05,
        3.85682784e+05, 4.13166128e+05, 8.47294535e+06, 2.49377635e+07,
        2.67148020e+07, 7.33973868e+07, 7.86276064e+07, 8.42305259e+07,
        2.16024841e+08, 2.31418541e+08, 2.47909180e+08, 2.65574924e+08],
       [1.00000000e+00, 2.32697000e+02, 4.36631000e+02, 7.20934000e+02,
        5.41478938e+04, 1.01602724e+05, 1.67759179e+05, 1.90646630e+05,
        3.14782133e+05, 5.19745832e+05, 1.26000524e+07, 2.36426490e+07,
        3.90370577e+07, 4.43628989e+07, 7.32488581e+07, 1.20943296e+08,
        8.32422288e+07, 1.37443638e+08, 2.26937143e+08, 3.74702442e+08],
       [1.00000000e+00, 9.01120000e+01, 5.38792000e+02, 8.14159000e+02,
        8.12017254e+03, 4.85516247e+04, 7.33654958e+04, 2.90296819e+05,
        4.38662356e+05, 6.62854877e+05, 7.31724988e+05, 4.37508401e+06,
        6.61111156e+06, 2.61592270e+07, 3.95287422e+07, 5.97311787e+07,
        1.56409604e+08, 2.36347768e+08, 3.57140905e+08, 5.39669264e+08],
       [1.00000000e+00, 1.79240000e+01, 5.85713000e+02, 6.18380000e+02,
        3.21269776e+02, 1.04983198e+04, 1.10838431e+04, 3.43059718e+05,
        3.62193205e+05, 3.82393824e+05, 5.75843947e+03, 1.88171884e+05,
        1.98666804e+05, 6.14900239e+06, 6.49195101e+06, 6.85402691e+06,
        2.00934537e+08, 2.12141269e+08, 2.23973034e+08, 2.36464693e+08],
       [1.00000000e+00, 2.52522000e+02, 6.24270000e+02, 6.21184000e+02,
        6.37673605e+04, 1.57641909e+05, 1.56862626e+05, 3.89713033e+05,
        3.87786536e+05, 3.85869562e+05, 1.61026614e+07, 3.98080501e+07,
        3.96112641e+07, 9.84111145e+07, 9.79246316e+07, 9.74405535e+07,
        2.43286155e+08, 2.42083501e+08, 2.40886791e+08, 2.39695998e+08],
       [1.00000000e+00, 3.32608000e+02, 5.70309000e+02, 5.61070000e+02,
        1.10628082e+05, 1.89689336e+05, 1.86616371e+05, 3.25252355e+05,
        3.19983271e+05, 3.14799545e+05, 3.67957850e+07, 6.30921906e+07,
        6.20700978e+07, 1.08181535e+08, 1.06428996e+08, 1.04704847e+08,
        1.85494346e+08, 1.82489339e+08, 1.79533014e+08, 1.76624581e+08],
       [1.00000000e+00, 8.03320000e+01, 4.87969000e+02, 7.10184000e+02,
        6.45323022e+03, 3.91995257e+04, 5.70505011e+04, 2.38113745e+05,
        3.46547776e+05, 5.04361314e+05, 5.18400890e+05, 3.14897630e+06,
        4.58298085e+06, 1.91281534e+07, 2.78388760e+07, 4.05163531e+07,
        1.16192126e+08, 1.69104572e+08, 2.46112686e+08, 3.58189335e+08],
       [1.00000000e+00, 3.62441000e+02, 5.50813000e+02, 8.00493000e+02,
        1.31363478e+05, 1.99637215e+05, 2.90131483e+05, 3.03394961e+05,
        4.40921951e+05, 6.40789043e+05, 4.76115105e+07, 7.23567117e+07,
        1.05155545e+08, 1.09962773e+08, 1.59808193e+08, 2.32248222e+08,
        1.67113889e+08, 2.42865542e+08, 3.52954935e+08, 5.12947143e+08]])

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

polynomial_model = LinearRegression()
polynomial_model.fit(X_train_poly, Y_train)
print("Model coefficients = ", polynomial_model.coef_)
print("Constant term (bias) = ", polynomial_model.intercept_)

Model coefficients =  [ 0.00000000e+00  8.44956564e+03 -7.03926860e+02  3.91781726e+03
 -7.27929661e+00 -8.46412648e+00 -1.17297723e+01  1.85161715e+00
  9.12487891e+00 -5.89023638e+00  7.03479781e-04 -1.54576830e-03
  9.41465525e-03  9.78622737e-03  6.73304083e-02  6.58873295e-03
 -5.15228130e-03  6.53665385e-03 -5.71172445e-03  4.06079530e-03]
Constant term (bias) =  878612.6569970567

5.3 Test the model¶

We can apply this model to both the original traning data and to the test data:

y_train_predicted = polynomial_model.predict(X_train_poly)
y_test_predict = polynomial_model.predict(poly_features.fit_transform(X_test))

Then measure the model quality for each case:

rmse_train = np.sqrt(mean_squared_error(Y_train, y_train_predicted))
r2_train = r2_score(Y_train, y_train_predicted)

rmse_test = np.sqrt(mean_squared_error(Y_test,y_test_predict))
r2_test = r2_score(Y_test, y_test_predict)

print("R2:")
print("Train = ", r2_train)
print("Test = ", r2_test)
print("RMSE:")
print("Train = ", rmse_train)
print("Test = ", rmse_test)

R2:
Train =  0.9791796776657181
Test =  0.9814345530715797
RMSE:
Train =  412795.4591276655
Test =  393860.84246070404

Then we get the results of two regression evaluation indicators - mean square error root (RMSE) and R squared (R2).

6. Clustering Customers¶

6.1 Standardize the data into a standard size¶

perfume_preference[0:10]

	Narcissus	Agrumen	Oud	Jasmine	Amber	Neroli	Indole	Vanilla	Frankincense	Bergamot	Galbanum	Magnolia	Sandalwood	Cashmeran	Citron	Opopanax	Aliphatic Aldehydes	Vetiver
0	1353.0	1252.0	4066.0	3838.0	2144.0	4404.0	32082.0	3866.0	2505.0	3972.0	4485.0	6441.0	4106.0	1722.0	4287.0	4820.0	4140.0	1463.0
1	1089.0	2152.0	4045.0	3710.0	2235.0	4352.0	30398.0	4769.0	2995.0	4720.0	4532.0	10931.0	3794.0	1638.0	4648.0	4472.0	4184.0	1071.0
2	4177.0	3592.0	3596.0	1745.0	3234.0	2116.0	21678.0	4864.0	3178.0	3381.0	1376.0	18153.0	2502.0	1733.0	1747.0	2728.0	4580.0	4742.0
3	4899.0	3738.0	2454.0	3976.0	4945.0	3853.0	17963.0	3040.0	2943.0	2870.0	4016.0	18819.0	1990.0	5118.0	2391.0	2012.0	3470.0	3057.0
4	4822.0	4030.0	3447.0	4225.0	4078.0	3772.0	23988.0	3389.0	2415.0	2695.0	3887.0	20367.0	2118.0	4530.0	2427.0	3205.0	4319.0	2289.0
5	2251.0	2305.0	4058.0	3330.0	1775.0	4882.0	16567.0	5148.0	4443.0	2472.0	2615.0	11655.0	3061.0	1549.0	1563.0	1709.0	3426.0	3003.0
6	1661.0	2199.0	4994.0	2795.0	2231.0	4108.0	31511.0	3584.0	2771.0	4153.0	4462.0	11061.0	3791.0	2123.0	4528.0	4716.0	4124.0	2016.0
7	4690.0	3674.0	3827.0	2130.0	3483.0	2544.0	21010.0	4284.0	2457.0	3610.0	1819.0	18601.0	3917.0	2129.0	1609.0	2614.0	3879.0	3962.0
8	4735.0	3236.0	3255.0	3349.0	4221.0	4038.0	22356.0	3202.0	2804.0	2754.0	2968.0	16407.0	1899.0	4813.0	2878.0	2862.0	3125.0	2692.0
9	1259.0	2679.0	3541.0	3159.0	1937.0	4619.0	31967.0	4662.0	2797.0	3840.0	5327.0	13610.0	3317.0	1889.0	4457.0	4795.0	4390.0	1690.0

standardized_customer_data = preprocessing.scale(perfume_preference)
standardized_customer_data_df = pd.DataFrame(standardized_customer_data,columns = perfume_preference.columns)

6.2 Decide the number of clusters - Elbow method¶

import matplotlib.pyplot as plt

sse = []
for k in range(1,11):
    kmeans = KMeans(n_clusters = k,)
    kmeans.fit(standardized_customer_data_df)
    sse.append(kmeans.inertia_)
x = range(1,11)
plt.xlabel('K')
plt.ylabel('SSE')
plt.plot(x,sse,'o-')
plt.show()

png

The 'elbow' value on the graph indicates the optimum number of clusters. The number of clusters here is 4.

6.3 Cluster the data (build the model)¶

First we create the object('machine') that we will use to build the model.

kmeans = KMeans(n_clusters = 4)

Then we use that object to identify clusters in the data.

kmeans.fit(standardized_customer_data_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

y_km = kmeans.fit_predict(standardized_customer_data_df)

6.4 Reviw the Results¶

We want to know: for each group, how tightly clustered they are. That is, identify variances for each these groups.

km0_var = np.var(y_km ==0)
km1_var = np.var(y_km ==1)
km2_var = np.var(y_km ==2)
km3_var = np.var(y_km ==3)
print("Variance of group 0 is %f" % km0_var)
print("Variance of group 1 is %f" % km1_var)
print("Variance of group 2 is %f" % km2_var)
print("Variance of group 3 is %f" % km3_var)

Variance of group 0 is 0.188117
Variance of group 1 is 0.187171
Variance of group 2 is 0.190040
Variance of group 3 is 0.184611

Group 1 is the tightest, whereas group 2 is rather loose.

Specifically, for any customer group identified indicate the spread (variance) for each group for each dimension (quantity of scent chemical)：

km0_che_var = np.var(perfume_preference[y_km == 0], axis=0) 
print("Variance for each dimension of group 0 is: ")
km0_che_var

Variance for each dimension of group 0 is:





Narcissus              8.070874e+04
Agrumen                1.860085e+05
Oud                    1.247092e+05
Jasmine                1.376372e+05
Amber                  1.754141e+05
Neroli                 9.981178e+04
Indole                 1.814991e+06
Vanilla                1.461047e+05
Frankincense           4.453063e+04
Bergamot               1.587021e+05
Galbanum               5.220495e+04
Magnolia               4.646825e+06
Sandalwood             8.379077e+04
Cashmeran              8.081125e+04
Citron                 1.889976e+05
Opopanax               7.244132e+04
Aliphatic Aldehydes    1.835542e+05
Vetiver                1.363927e+05
dtype: float64

We can see that they are similar with regard to some chemicals but have a wide range of responses to other scent chemicals.

The other three groups did much the same:

km1_che_var = np.var(perfume_preference[y_km == 1], axis=0) 
print("Variance for each dimension of group 1 is: ")
km1_che_var

Variance for each dimension of group 1 is:





Narcissus              1.041274e+05
Agrumen                6.039455e+04
Oud                    1.338378e+05
Jasmine                1.140873e+05
Amber                  1.361587e+05
Neroli                 1.831613e+05
Indole                 2.477114e+06
Vanilla                9.723437e+04
Frankincense           4.640931e+04
Bergamot               5.826429e+04
Galbanum               2.036101e+05
Magnolia               1.511061e+06
Sandalwood             4.278111e+04
Cashmeran              1.827669e+05
Citron                 8.646526e+04
Opopanax               9.921890e+04
Aliphatic Aldehydes    1.151117e+05
Vetiver                1.233736e+05
dtype: float64

km2_che_var = np.var(perfume_preference[y_km == 2], axis=0) 
print("Variance for each dimension of group 2 is: ")
km2_che_var

Variance for each dimension of group 2 is:





Narcissus              9.723301e+04
Agrumen                1.944770e+05
Oud                    1.962811e+05
Jasmine                1.289577e+05
Amber                  6.918558e+04
Neroli                 4.724214e+04
Indole                 1.778136e+06
Vanilla                1.525133e+05
Frankincense           1.786047e+05
Bergamot               7.893402e+04
Galbanum               1.276341e+05
Magnolia               4.858489e+06
Sandalwood             6.911784e+04
Cashmeran              6.365879e+04
Citron                 4.415088e+04
Opopanax               7.111018e+04
Aliphatic Aldehydes    9.197844e+04
Vetiver                1.581174e+05
dtype: float64

km3_che_var = np.var(perfume_preference[y_km == 3], axis=0) 
print("Variance for each dimension of group 3 is: ")
km3_che_var

Variance for each dimension of group 3 is:





Narcissus              1.219228e+05
Agrumen                9.138574e+04
Oud                    1.677518e+05
Jasmine                5.476186e+04
Amber                  5.526990e+04
Neroli                 1.466144e+05
Indole                 3.613530e+06
Vanilla                1.238448e+05
Frankincense           1.504654e+05
Bergamot               4.178902e+04
Galbanum               7.548093e+04
Magnolia               2.287027e+06
Sandalwood             1.405492e+05
Cashmeran              1.905020e+05
Citron                 1.384131e+05
Opopanax               1.444139e+05
Aliphatic Aldehydes    8.179831e+04
Vetiver                1.320200e+05
dtype: float64

print(y_km[0:20])

[2 2 3 1 1 0 2 3 1 2 3 2 2 2 3 1 0 0 1 2]

print(perfume_preference[y_km == 2][0:5])

    Narcissus  Agrumen     Oud  Jasmine   Amber  Neroli   Indole  Vanilla  \
0      1353.0   1252.0  4066.0   3838.0  2144.0  4404.0  32082.0   3866.0   
1      1089.0   2152.0  4045.0   3710.0  2235.0  4352.0  30398.0   4769.0   
6      1661.0   2199.0  4994.0   2795.0  2231.0  4108.0  31511.0   3584.0   
9      1259.0   2679.0  3541.0   3159.0  1937.0  4619.0  31967.0   4662.0   
11     1683.0   2078.0  3989.0   2873.0  2038.0  4309.0  30393.0   3648.0

    Frankincense  Bergamot  Galbanum  Magnolia  Sandalwood  Cashmeran  Citron  \
0         2505.0    3972.0    4485.0    6441.0      4106.0     1722.0  4287.0   
1         2995.0    4720.0    4532.0   10931.0      3794.0     1638.0  4648.0   
6         2771.0    4153.0    4462.0   11061.0      3791.0     2123.0  4528.0   
9         2797.0    3840.0    5327.0   13610.0      3317.0     1889.0  4457.0   
11        3011.0    4418.0    4092.0   10568.0      3768.0     1746.0  4483.0

    Opopanax  Aliphatic Aldehydes  Vetiver  
0     4820.0               4140.0   1463.0  
1     4472.0               4184.0   1071.0  
6     4716.0               4124.0   2016.0  
9     4795.0               4390.0   1690.0  
11    4495.0               4602.0   2307.0

We can look at the clusters in chart form.

#Jasmine vs Vanilla
plt.figure(figsize = (7,7))
plt.scatter(perfume_preference[y_km ==0]['Jasmine'], perfume_preference[y_km == 0]['Vanilla'],
           s = 15,c = 'red',alpha = .5)
plt.scatter(perfume_preference[y_km ==1]['Jasmine'], perfume_preference[y_km == 1]['Vanilla'],
           s = 15,c = 'black',alpha = .5)
plt.scatter(perfume_preference[y_km ==2]['Jasmine'], perfume_preference[y_km == 2]['Vanilla'],
           s = 15,c = 'blue',alpha = .5)
plt.scatter(perfume_preference[y_km ==3]['Jasmine'], perfume_preference[y_km == 3]['Vanilla'],
           s = 15,c = 'cyan',alpha = .5)

<matplotlib.collections.PathCollection at 0x1a86c2add48>

png

Take a '3D' view of the data

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

%matplotlib
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(111,projection = '3d')
ax.view_init(20,20)
ax.set_xlabel('Jasmine')
ax.set_ylabel('Vanilla')
ax.set_zlabel('Aliphatic Aldehydes')

ax.scatter(perfume_preference[y_km ==0]['Jasmine'], perfume_preference[y_km == 0]['Vanilla'],
           perfume_preference[y_km ==0]['Aliphatic Aldehydes'], s = 15,c = 'red',alpha = .3)
ax.scatter(perfume_preference[y_km ==1]['Jasmine'], perfume_preference[y_km == 1]['Vanilla'],
           perfume_preference[y_km ==1]['Aliphatic Aldehydes'], s = 15,c = 'black',alpha = .3)
ax.scatter(perfume_preference[y_km ==2]['Jasmine'], perfume_preference[y_km == 2]['Vanilla'],
           perfume_preference[y_km ==2]['Aliphatic Aldehydes'], s = 15,c = 'blue',alpha = .3)
ax.scatter(perfume_preference[y_km ==3]['Jasmine'], perfume_preference[y_km == 3]['Vanilla'],
           perfume_preference[y_km ==3]['Aliphatic Aldehydes'], s = 15,c = 'cyan',alpha = .3)

Using matplotlib backend: Qt5Agg





<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1a872c3bc88>

png

The central point of K-means:

centers = kmeans.cluster_centers_
centers

array([[-0.63508675, -0.41641312,  0.1801924 ,  0.10214018, -0.71073531,
         0.92559669, -0.91458644,  0.69966415,  1.49956699, -0.59666119,
        -0.60720389, -0.41660319,  0.57117372, -0.48341316, -0.74077772,
        -0.91436421, -0.42815036,  0.10413989],
       [ 1.01512708,  0.75401605, -1.09476607,  1.04776978,  1.37499078,
         0.14680867, -0.54262287, -1.52078422, -0.61896137, -1.13774394,
         0.62329158,  0.75422437, -1.61128735,  1.66687438, -0.18799296,
        -0.54276709, -0.53338134, -0.30061661],
       [-1.2345529 , -1.27841445,  0.84571864,  0.21695373, -1.00251791,
         0.4677676 ,  1.5634698 ,  0.12346678, -0.43796367,  1.19821614,
         1.18028798, -1.27839257,  0.56702368, -0.50325856,  1.59486712,
         1.56359936,  0.82022759, -1.19924176],
       [ 0.90638695,  0.99381461,  0.04883221, -1.40107939,  0.37454187,
        -1.59030928, -0.13842807,  0.70373206, -0.45305188,  0.52350583,
        -1.24439817,  0.99377463,  0.46500412, -0.67859451, -0.71193968,
        -0.13864473,  0.12810446,  1.45221773]])

We can visualize these arrays:

from matplotlib.pyplot import figure

figure(figsize = (20, 10))
x = np.array(range(0, 18))
y = np.array([[ 0.90638695,  0.99381461,  0.04883221, -1.40107939,  0.37454187,
        -1.59030928, -0.13842807,  0.70373206, -0.45305188,  0.52350583,
        -1.24439817,  0.99377463,  0.46500412, -0.67859451, -0.71193968,
        -0.13864473,  0.12810446,  1.45221773],
       [-1.2345529 , -1.27841445,  0.84571864,  0.21695373, -1.00251791,
         0.4677676 ,  1.5634698 ,  0.12346678, -0.43796367,  1.19821614,
         1.18028798, -1.27839257,  0.56702368, -0.50325856,  1.59486712,
         1.56359936,  0.82022759, -1.19924176],
       [ 1.01512708,  0.75401605, -1.09476607,  1.04776978,  1.37499078,
         0.14680867, -0.54262287, -1.52078422, -0.61896137, -1.13774394,
         0.62329158,  0.75422437, -1.61128735,  1.66687438, -0.18799296,
        -0.54276709, -0.53338134, -0.30061661],
       [-0.63508675, -0.41641312,  0.1801924 ,  0.10214018, -0.71073531,
         0.92559669, -0.91458644,  0.69966415,  1.49956699, -0.59666119,
        -0.60720389, -0.41660319,  0.57117372, -0.48341316, -0.74077772,
        -0.91436421, -0.42815036,  0.10413989]])
plt.title("Plotting Central Points")
plt.xlabel("Features")
plt.ylabel("Preference")

for i, array in enumerate(y):
    plt.scatter(x, array, s = (150, ), color = np.random.rand(3, ), marker = "o", label = f"Array #{i}")

plt.legend(loc = "center left", bbox_to_anchor=(1, 0.5))
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],[r'$Narcissus$', r'$Agrumen$', r'$Oud$', r'$Jasmine$', r'$Amber$', 
                                                          r'$Neroli$', r'$Indole$', r'$Vanilla$', r'$Frankincen$', r'$Bergamot$', 
                                                          r'$Galbanum$', r'$Magnolia$', r'$Sandalwood$', r'$Cashmeran$', 
                                                          r'$Citron$', r'$Opopanax$', r'$Aliphatic Aldehydes$', r'$Vetiver$'])
plt.show()

png

We define that if the coefficient is positive, then the feature is preferred by group customers.

Best mixture for each group:

Group 0: Narcissus, Agrumen, Oud, Amber, Vanilla, Bergamot, Magnolia, Sandalwood, Aliphatic Aldehydes, Vetiver
Group 1: Oud, Jasmine, Neroli, Indole, Vanilla, Bergamot, Galbanum, Sandalwood, Citron, Opopanax, Aliphatic Aldehydes
Group 2: Narcissus, Agrumen, Jasmine, Amber, Neroli, Galbanum, Magnolia, Cashmeran
Group 3: Oud, Jasmine, Neroli, Vanilla, Frankincen, Sandalwood, Vetiver