1. Load the data-set¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
We need to load the perfume datasets from the local CSV files.
# Read the customer preference table from a CSV file (path is relative to the working directory).
perfume_preference = pd.read_csv("Perfume preference.csv")
# Bare expression: shows the DataFrame when run as a notebook cell.
perfume_preference
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1353.0 | 1252.0 | 4066.0 | 3838.0 | 2144.0 | 4404.0 | 32082.0 | 3866.0 | 2505.0 | 3972.0 | 4485.0 | 6441.0 | 4106.0 | 1722.0 | 4287.0 | 4820.0 | 4140.0 | 1463.0 |
| 1 | 1089.0 | 2152.0 | 4045.0 | 3710.0 | 2235.0 | 4352.0 | 30398.0 | 4769.0 | 2995.0 | 4720.0 | 4532.0 | 10931.0 | 3794.0 | 1638.0 | 4648.0 | 4472.0 | 4184.0 | 1071.0 |
| 2 | 4177.0 | 3592.0 | 3596.0 | 1745.0 | 3234.0 | 2116.0 | 21678.0 | 4864.0 | 3178.0 | 3381.0 | 1376.0 | 18153.0 | 2502.0 | 1733.0 | 1747.0 | 2728.0 | 4580.0 | 4742.0 |
| 3 | 4899.0 | 3738.0 | 2454.0 | 3976.0 | 4945.0 | 3853.0 | 17963.0 | 3040.0 | 2943.0 | 2870.0 | 4016.0 | 18819.0 | 1990.0 | 5118.0 | 2391.0 | 2012.0 | 3470.0 | 3057.0 |
| 4 | 4822.0 | 4030.0 | 3447.0 | 4225.0 | 4078.0 | 3772.0 | 23988.0 | 3389.0 | 2415.0 | 2695.0 | 3887.0 | 20367.0 | 2118.0 | 4530.0 | 2427.0 | 3205.0 | 4319.0 | 2289.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9492 | 4857.0 | 3654.0 | 2788.0 | 3830.0 | 4272.0 | 4349.0 | 20816.0 | 2524.0 | 2675.0 | 2550.0 | 4620.0 | 18506.0 | 1999.0 | 5329.0 | 2140.0 | 2568.0 | 4358.0 | 2361.0 |
| 9493 | 2040.0 | 2561.0 | 3913.0 | 3375.0 | 2058.0 | 4722.0 | 19399.0 | 4765.0 | 3796.0 | 2949.0 | 2686.0 | 13008.0 | 4201.0 | 1830.0 | 1534.0 | 2272.0 | 3348.0 | 2992.0 |
| 9494 | 4846.0 | 4883.0 | 4153.0 | 2108.0 | 4164.0 | 1881.0 | 20551.0 | 5030.0 | 2683.0 | 4001.0 | 1450.0 | 24684.0 | 3979.0 | 1187.0 | 2107.0 | 2508.0 | 4581.0 | 4731.0 |
| 9495 | 4310.0 | 3916.0 | 3937.0 | 2488.0 | 3343.0 | 2219.0 | 22914.0 | 5104.0 | 2640.0 | 3864.0 | 1730.0 | 19874.0 | 3654.0 | 499.0 | 1920.0 | 2971.0 | 4476.0 | 4654.0 |
| 9496 | 2698.0 | 3174.0 | 3984.0 | 3541.0 | 2522.0 | 4946.0 | 18512.0 | 5165.0 | 4167.0 | 2704.0 | 2536.0 | 16069.0 | 3851.0 | 1620.0 | 1830.0 | 2084.0 | 3240.0 | 3644.0 |
9497 rows × 18 columns
# Read the perfume score table from a CSV file (path is relative to the working directory).
perfume_score = pd.read_csv("Perfume Score.csv")
# Bare expression: shows the DataFrame when run as a notebook cell.
perfume_score
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | Scent Quality Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489.766 | 343.510 | 638.519 | 315.377 | 966.417 | 913.256 | 1015.036 | 479.027 | 485.797 | 2918.050062 | 108.538 | 727.438 | 936.842 | 4801.306119 | 261.952 | 148.593 | 783.264 | 809.541 | 1.302700e+07 |
| 1 | 472.841 | 218.288 | 642.332 | 210.582 | 995.068 | 989.447 | 958.614 | 507.113 | 242.015 | 2119.074840 | 246.654 | 755.477 | 840.936 | 4896.315590 | 149.498 | 44.490 | 906.204 | 815.512 | 1.159073e+07 |
| 2 | 472.620 | 323.480 | 696.770 | 288.379 | 1006.334 | 875.163 | 987.398 | 611.463 | 410.451 | 2679.139347 | 281.022 | 729.155 | 825.386 | 5350.521973 | 177.980 | 141.612 | 705.294 | 794.394 | 1.367693e+07 |
| 3 | 503.155 | 397.632 | 644.533 | 151.414 | 960.097 | 905.462 | 1031.227 | 469.357 | 388.405 | 1784.035393 | 280.953 | 711.906 | 786.198 | 5029.939322 | 29.515 | 149.231 | 678.681 | 837.614 | 7.997427e+06 |
| 4 | 499.780 | 344.096 | 643.764 | 353.518 | 1033.988 | 978.976 | 871.312 | 439.266 | 311.002 | 3236.214279 | 272.058 | 737.003 | 898.238 | 4988.788504 | 138.884 | 122.238 | 622.090 | 824.174 | 1.113290e+07 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4999 | 449.162 | 353.896 | 680.031 | 220.188 | 940.716 | 851.543 | 951.874 | 600.726 | 383.718 | 2209.955448 | 122.345 | 852.887 | 973.806 | 5122.835105 | 50.474 | 87.481 | 662.834 | 821.282 | 1.072665e+07 |
| 5000 | 526.781 | 392.868 | 652.819 | 268.901 | 983.403 | 700.787 | 1031.042 | 583.384 | 414.174 | 2453.815268 | 368.325 | 770.897 | 825.038 | 5009.288848 | 195.544 | 83.047 | 819.217 | 830.439 | 1.426609e+07 |
| 5001 | 475.160 | 256.740 | 655.360 | 204.422 | 905.181 | 1055.073 | 1008.550 | 539.192 | 399.411 | 2007.515839 | 192.985 | 774.179 | 747.784 | 4925.275302 | 205.319 | 143.601 | 741.248 | 780.727 | 9.882660e+06 |
| 5002 | 481.422 | 278.652 | 647.467 | 147.307 | 1033.814 | 880.379 | 1053.847 | 510.981 | 410.661 | 1762.999938 | 144.866 | 802.051 | 890.813 | 4992.597380 | 116.158 | 80.665 | 804.591 | 792.583 | 9.200338e+06 |
| 5003 | 476.130 | 364.371 | 659.429 | 224.550 | 1207.776 | 837.054 | 882.858 | 625.714 | 361.077 | 2175.815594 | 231.943 | 621.123 | 737.455 | 4939.983539 | 231.950 | 178.103 | 710.752 | 867.652 | 1.195802e+07 |
5004 rows × 19 columns
2. Review the data quantitatively¶
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the score data.
perfume_score.describe()
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | Scent Quality Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5004.000000 | 5.004000e+03 |
| mean | 470.922058 | 346.112136 | 674.849797 | 209.349424 | 988.363780 | 888.163961 | 944.930020 | 564.445091 | 378.302281 | 2186.942959 | 250.475366 | 769.066442 | 866.283868 | 5172.149509 | 121.493370 | 118.018880 | 703.506534 | 802.327559 | 1.079593e+07 |
| std | 23.038942 | 59.788428 | 36.724524 | 80.932744 | 74.334501 | 84.265546 | 70.080494 | 72.778522 | 60.988463 | 575.210895 | 76.697327 | 89.079681 | 87.392412 | 275.760510 | 70.881267 | 62.237022 | 99.694353 | 25.963051 | 2.867554e+06 |
| min | 383.651000 | 121.396000 | 543.403000 | 0.000000 | 652.234000 | 539.166000 | 683.213000 | 287.286000 | 142.905000 | 487.811886 | 0.000000 | 457.725000 | 545.930000 | 4119.640577 | 0.000000 | 0.000000 | 317.364000 | 714.678000 | 3.860472e+06 |
| 25% | 455.292000 | 305.453750 | 650.423250 | 154.964250 | 938.166000 | 830.146500 | 896.683000 | 514.985750 | 337.244250 | 1804.401369 | 198.506500 | 707.927750 | 809.248500 | 4990.344908 | 69.145750 | 74.265250 | 635.601500 | 784.980750 | 8.777066e+06 |
| 50% | 470.695500 | 346.160500 | 675.707000 | 209.278000 | 989.786500 | 889.092500 | 945.116500 | 563.099000 | 378.364000 | 2191.656699 | 249.945000 | 770.214000 | 867.822500 | 5174.842876 | 118.840500 | 117.010500 | 703.989000 | 802.968000 | 1.057777e+07 |
| 75% | 486.797750 | 386.261000 | 699.350250 | 262.900250 | 1038.281500 | 945.673750 | 993.838500 | 612.617000 | 418.213250 | 2571.954572 | 301.876250 | 830.076000 | 924.809000 | 5358.193002 | 170.583000 | 160.389500 | 769.465500 | 819.953500 | 1.255241e+07 |
| max | 548.708000 | 562.238000 | 817.061000 | 481.593000 | 1258.446000 | 1183.733000 | 1214.694000 | 825.775000 | 646.347000 | 4079.337285 | 535.614000 | 1116.747000 | 1195.179000 | 6231.998892 | 396.534000 | 348.029000 | 1049.738000 | 888.987000 | 2.203800e+07 |
# Per-column summary statistics of the preference data; a 'count' below the
# number of rows means that column has missing values.
perfume_preference.describe()
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9494.000000 | 9493.000000 | 9489.000000 | 9495.000000 | 9494.000000 | 9492.000000 | 9489.000000 | 9494.000000 | 9491.000000 | 9490.000000 | 9489.000000 | 9493.000000 | 9492.000000 | 9492.000000 | 9487.000000 | 9489.000000 | 9492.000000 | 9487.000000 |
| mean | 3265.382663 | 3121.755083 | 3763.242597 | 3106.079726 | 3005.079524 | 3821.780341 | 22736.683001 | 4312.222878 | 3113.745970 | 3426.057113 | 3139.691538 | 15808.726325 | 3217.781184 | 2446.029709 | 2479.632023 | 2947.475182 | 4202.323957 | 3008.485190 |
| std | 1390.405062 | 924.872149 | 552.936664 | 686.367481 | 991.974076 | 1059.493773 | 5232.947390 | 861.309113 | 661.345376 | 740.826433 | 1223.697310 | 4625.023111 | 785.173638 | 1342.850878 | 1191.003546 | 1046.691982 | 409.025964 | 1219.619389 |
| min | 515.000000 | 584.000000 | 1998.000000 | 1373.000000 | 946.000000 | 642.000000 | 13318.000000 | 1685.000000 | 1239.000000 | 1703.000000 | 551.000000 | 3087.000000 | 1237.000000 | 58.000000 | 30.000000 | 1057.000000 | 2256.000000 | 283.000000 |
| 25% | 1962.000000 | 2334.000000 | 3383.000000 | 2538.500000 | 2119.000000 | 3176.000000 | 18746.000000 | 3642.000000 | 2628.000000 | 2728.000000 | 2071.000000 | 11880.000000 | 2545.750000 | 1581.000000 | 1620.000000 | 2149.000000 | 3935.750000 | 2093.000000 |
| 50% | 2911.500000 | 3372.000000 | 3780.000000 | 3201.000000 | 2941.000000 | 4193.500000 | 20910.000000 | 4573.000000 | 2908.000000 | 3525.000000 | 2991.000000 | 17055.000000 | 3515.000000 | 1859.500000 | 2069.000000 | 2581.000000 | 4215.000000 | 2873.000000 |
| 75% | 4598.000000 | 3913.000000 | 4142.000000 | 3616.000000 | 3764.000000 | 4569.000000 | 28136.000000 | 4953.000000 | 3716.000000 | 4036.000000 | 4293.000000 | 19765.000000 | 3773.000000 | 2775.250000 | 3965.500000 | 4037.000000 | 4486.000000 | 3863.000000 |
| max | 5761.000000 | 5119.000000 | 5811.000000 | 4936.000000 | 5798.000000 | 5826.000000 | 35793.000000 | 6136.000000 | 4814.000000 | 5267.000000 | 5998.000000 | 25873.000000 | 4879.000000 | 6348.000000 | 5061.000000 | 5562.000000 | 5547.000000 | 6072.000000 |
We should also review the data to see if there are any missing values.
# For each column of the score data: True if at least one value is missing.
pd.isnull(perfume_score).any()
Narcissus False
Agrumen False
Oud False
Jasmine False
Amber False
Neroli False
Indole False
Vanilla False
Frankincense False
Bergamot False
Galbanum False
Magnolia False
Sandalwood False
Cashmeran False
Citron False
Opopanax False
Aliphatic Aldehydes False
Vetiver False
Scent Quality Score False
dtype: bool
# For each column of the preference data: True if at least one value is missing.
pd.isnull(perfume_preference).any()
Narcissus True
Agrumen True
Oud True
Jasmine True
Amber True
Neroli True
Indole True
Vanilla True
Frankincense True
Bergamot True
Galbanum True
Magnolia True
Sandalwood True
Cashmeran True
Citron True
Opopanax True
Aliphatic Aldehydes True
Vetiver True
dtype: bool
It turns out that the perfume preference data is flawed: every column contains missing values.
3. Clean and tidy the data¶
Display a count of missing data
# Total number of missing cells: per-column null counts, then summed over all columns.
print(perfume_preference.isnull().sum().sum())
104
Visualising missing data
import missingno as msno
# Full slice: selects every row and every column of the preference table.
perfume_preference_columns = perfume_preference.iloc[:,:]
# Matrix plot showing where values are missing.
msno.matrix(perfume_preference_columns)
<matplotlib.axes._subplots.AxesSubplot at 0x1a85e7c40c8>

Draw a bar plot to indicate the amount of missing data in each feature
# Bar chart of how complete each column is.
msno.bar(perfume_preference_columns)
<matplotlib.axes._subplots.AxesSubplot at 0x1a85eb2b208>

Impute missing data using the mean of other data from the same feature
# Replace every missing value with the mean of its own column.
# inplace = True modifies perfume_preference itself instead of returning a copy.
perfume_preference.fillna(perfume_preference.mean(), inplace = True)
# Bare expression: shows the imputed DataFrame when run as a notebook cell.
perfume_preference
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1353.0 | 1252.0 | 4066.0 | 3838.0 | 2144.0 | 4404.0 | 32082.0 | 3866.0 | 2505.0 | 3972.0 | 4485.0 | 6441.0 | 4106.0 | 1722.0 | 4287.0 | 4820.0 | 4140.0 | 1463.0 |
| 1 | 1089.0 | 2152.0 | 4045.0 | 3710.0 | 2235.0 | 4352.0 | 30398.0 | 4769.0 | 2995.0 | 4720.0 | 4532.0 | 10931.0 | 3794.0 | 1638.0 | 4648.0 | 4472.0 | 4184.0 | 1071.0 |
| 2 | 4177.0 | 3592.0 | 3596.0 | 1745.0 | 3234.0 | 2116.0 | 21678.0 | 4864.0 | 3178.0 | 3381.0 | 1376.0 | 18153.0 | 2502.0 | 1733.0 | 1747.0 | 2728.0 | 4580.0 | 4742.0 |
| 3 | 4899.0 | 3738.0 | 2454.0 | 3976.0 | 4945.0 | 3853.0 | 17963.0 | 3040.0 | 2943.0 | 2870.0 | 4016.0 | 18819.0 | 1990.0 | 5118.0 | 2391.0 | 2012.0 | 3470.0 | 3057.0 |
| 4 | 4822.0 | 4030.0 | 3447.0 | 4225.0 | 4078.0 | 3772.0 | 23988.0 | 3389.0 | 2415.0 | 2695.0 | 3887.0 | 20367.0 | 2118.0 | 4530.0 | 2427.0 | 3205.0 | 4319.0 | 2289.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9492 | 4857.0 | 3654.0 | 2788.0 | 3830.0 | 4272.0 | 4349.0 | 20816.0 | 2524.0 | 2675.0 | 2550.0 | 4620.0 | 18506.0 | 1999.0 | 5329.0 | 2140.0 | 2568.0 | 4358.0 | 2361.0 |
| 9493 | 2040.0 | 2561.0 | 3913.0 | 3375.0 | 2058.0 | 4722.0 | 19399.0 | 4765.0 | 3796.0 | 2949.0 | 2686.0 | 13008.0 | 4201.0 | 1830.0 | 1534.0 | 2272.0 | 3348.0 | 2992.0 |
| 9494 | 4846.0 | 4883.0 | 4153.0 | 2108.0 | 4164.0 | 1881.0 | 20551.0 | 5030.0 | 2683.0 | 4001.0 | 1450.0 | 24684.0 | 3979.0 | 1187.0 | 2107.0 | 2508.0 | 4581.0 | 4731.0 |
| 9495 | 4310.0 | 3916.0 | 3937.0 | 2488.0 | 3343.0 | 2219.0 | 22914.0 | 5104.0 | 2640.0 | 3864.0 | 1730.0 | 19874.0 | 3654.0 | 499.0 | 1920.0 | 2971.0 | 4476.0 | 4654.0 |
| 9496 | 2698.0 | 3174.0 | 3984.0 | 3541.0 | 2522.0 | 4946.0 | 18512.0 | 5165.0 | 4167.0 | 2704.0 | 2536.0 | 16069.0 | 3851.0 | 1620.0 | 1830.0 | 2084.0 | 3240.0 | 3644.0 |
9497 rows × 18 columns
Do a final check by re-counting the amount of missing data
# Re-count the missing cells after imputation; the expected result is 0.
print(perfume_preference.isnull().sum().sum())
0
4. Review the data visually¶
from pandas.plotting import scatter_matrix
# Grid of pairwise scatter plots for every column of the preference data.
# The trailing ';' suppresses the echoed return value in a notebook cell.
scatter_matrix(perfume_preference, figsize = (12,12));

# Same pairwise view for the score data (features plus Scent Quality Score).
scatter_matrix(perfume_score, figsize = (12,12));

Correlation matrix:
# Heatmap of the absolute pairwise correlations between preference features.
sns.set(rc={'figure.figsize': (10, 10)})
# Correlations are rounded to two decimals first, then the sign is dropped so
# that only the strength of each relationship is shown.
correlation_matrix = perfume_preference.corr().round(2).abs()
ax = sns.heatmap(data=correlation_matrix, annot=True, cmap='Reds')
# Widen the y-axis by half a cell at both ends
# (presumably a workaround for clipped top/bottom heatmap rows - confirm).
y_limits = ax.get_ylim()
bottom, top = y_limits[0], y_limits[1]
ax.set_ylim(bottom + 0.5, top - 0.5)
(18.5, -0.5)

From this correlation map, we see that: - Vetiver is strongly correlated with Neroli and Citron - Opopanax is strongly correlated with Bergamot - ...
# Heatmap of the absolute pairwise correlations in the score data
# (features and Scent Quality Score).
sns.set(rc={'figure.figsize': (10, 10)})
# Correlations are rounded to two decimals first, then the sign is dropped so
# that only the strength of each relationship is shown.
correlation_matrix = perfume_score.corr().round(2).abs()
ax = sns.heatmap(data=correlation_matrix, annot=True, cmap='Reds')
# Widen the y-axis by half a cell at both ends
# (presumably a workaround for clipped top/bottom heatmap rows - confirm).
y_limits = ax.get_ylim()
bottom, top = y_limits[0], y_limits[1]
ax.set_ylim(bottom + 0.5, top - 0.5)
(19.5, -0.5)

From this correlation map, we see that: - Scent Quality Score is strongly correlated with Bergamot and Jasmine - Scent Quality Score is also affected by Aliphatic Aldehydes and Vanilla to some extent - Jasmine and Bergamot are correlated with each other. We should not use both Jasmine and Bergamot for building the model, to avoid 'double counting' their impact on the result - Also Oud and Cashmeran are correlated.
Therefore, we choose Jasmine, Vanilla and Aliphatic Aldehydes for building our first model.
Three key features (Jasmine, Vanilla and Aliphatic Aldehydes) in a 3D plot:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
# 3D scatter of the three selected features; each point is coloured by its
# Scent Quality Score using the 'gist_heat' colour map.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
# Name the axes so each dimension of the 3D view can be identified.
ax.set_xlabel('Jasmine')
ax.set_ylabel('Vanilla')
ax.set_zlabel('Aliphatic Aldehydes')
ax.scatter(perfume_score['Jasmine'], perfume_score['Vanilla'], perfume_score['Aliphatic Aldehydes'],
           c=perfume_score['Scent Quality Score'], cmap='gist_heat')
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1a86c1b2908>

5. Polynomial regression¶
5.1 Select and split the data - processing the perfume score data¶
# Model inputs: the three features chosen from the correlation analysis.
# Selecting the columns directly (instead of rebuilding a frame from a NumPy
# array) keeps the original row index, so X stays aligned with Y row for row.
feature_columns = ['Jasmine', 'Vanilla', 'Aliphatic Aldehydes']
X = perfume_score[feature_columns].copy()
# Regression target.
Y = perfume_score['Scent Quality Score']
# Bare expression: shows the feature table when run as a notebook cell.
X
| Jasmine | Vanilla | Aliphatic Aldehydes | |
|---|---|---|---|
| 0 | 315.377 | 479.027 | 783.264 |
| 1 | 210.582 | 507.113 | 906.204 |
| 2 | 288.379 | 611.463 | 705.294 |
| 3 | 151.414 | 469.357 | 678.681 |
| 4 | 353.518 | 439.266 | 622.090 |
| ... | ... | ... | ... |
| 4999 | 220.188 | 600.726 | 662.834 |
| 5000 | 268.901 | 583.384 | 819.217 |
| 5001 | 204.422 | 539.192 | 741.248 |
| 5002 | 147.307 | 510.981 | 804.591 |
| 5003 | 224.550 | 625.714 | 710.752 |
5004 rows × 3 columns
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every metric reported below, reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Peek at the first ten training rows.
print(X_train[0:10])
Jasmine Vanilla Aliphatic Aldehydes
4427 170.793 570.382 908.473
3248 158.091 416.565 641.899
2286 203.866 600.023 642.780
581 232.697 436.631 720.934
1188 90.112 538.792 814.159
2535 17.924 585.713 618.380
2151 252.522 624.270 621.184
113 332.608 570.309 561.070
3801 80.332 487.969 710.184
3430 362.441 550.813 800.493
5.2 Build the model¶
We first generate the features and take a look at the extended set of feature names:
from sklearn.preprocessing import PolynomialFeatures
# Expand the three input features into all polynomial terms up to degree 3
# (bias column, linear, quadratic and cubic terms including interactions).
poly_features = PolynomialFeatures(degree = 3)
# Fit the transformer on the training data and build the expanded training matrix.
X_train_poly = poly_features.fit_transform(X_train)
# NOTE(review): get_feature_names was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out instead.
print(poly_features.get_feature_names(["Jasmine","Vanilla","Aliphatic Aldehydes"]))
['1', 'Jasmine', 'Vanilla', 'Aliphatic Aldehydes', 'Jasmine^2', 'Jasmine Vanilla', 'Jasmine Aliphatic Aldehydes', 'Vanilla^2', 'Vanilla Aliphatic Aldehydes', 'Aliphatic Aldehydes^2', 'Jasmine^3', 'Jasmine^2 Vanilla', 'Jasmine^2 Aliphatic Aldehydes', 'Jasmine Vanilla^2', 'Jasmine Vanilla Aliphatic Aldehydes', 'Jasmine Aliphatic Aldehydes^2', 'Vanilla^3', 'Vanilla^2 Aliphatic Aldehydes', 'Vanilla Aliphatic Aldehydes^2', 'Aliphatic Aldehydes^3']
Actual data:
# First ten rows of the expanded polynomial feature matrix.
X_train_poly[0:10]
array([[1.00000000e+00, 1.70793000e+02, 5.70382000e+02, 9.08473000e+02,
2.91702488e+04, 9.74172529e+04, 1.55160829e+05, 3.25335626e+05,
5.18176647e+05, 8.25323192e+05, 4.98207431e+06, 1.66381849e+07,
2.65003835e+07, 5.55650476e+07, 8.85009440e+07, 1.40959424e+08,
1.85565585e+08, 2.95558632e+08, 4.70749493e+08, 7.49783836e+08],
[1.00000000e+00, 1.58091000e+02, 4.16565000e+02, 6.41899000e+02,
2.49927643e+04, 6.58551774e+04, 1.01478455e+05, 1.73526399e+05,
2.67392657e+05, 4.12034326e+05, 3.95113110e+06, 1.04111109e+07,
1.60428304e+07, 2.74329620e+07, 4.22723725e+07, 6.51389187e+07,
7.22850245e+07, 1.11386422e+08, 1.71639079e+08, 2.64484422e+08],
[1.00000000e+00, 2.03866000e+02, 6.00023000e+02, 6.42780000e+02,
4.15613460e+04, 1.22324289e+05, 1.31040987e+05, 3.60027601e+05,
3.85682784e+05, 4.13166128e+05, 8.47294535e+06, 2.49377635e+07,
2.67148020e+07, 7.33973868e+07, 7.86276064e+07, 8.42305259e+07,
2.16024841e+08, 2.31418541e+08, 2.47909180e+08, 2.65574924e+08],
[1.00000000e+00, 2.32697000e+02, 4.36631000e+02, 7.20934000e+02,
5.41478938e+04, 1.01602724e+05, 1.67759179e+05, 1.90646630e+05,
3.14782133e+05, 5.19745832e+05, 1.26000524e+07, 2.36426490e+07,
3.90370577e+07, 4.43628989e+07, 7.32488581e+07, 1.20943296e+08,
8.32422288e+07, 1.37443638e+08, 2.26937143e+08, 3.74702442e+08],
[1.00000000e+00, 9.01120000e+01, 5.38792000e+02, 8.14159000e+02,
8.12017254e+03, 4.85516247e+04, 7.33654958e+04, 2.90296819e+05,
4.38662356e+05, 6.62854877e+05, 7.31724988e+05, 4.37508401e+06,
6.61111156e+06, 2.61592270e+07, 3.95287422e+07, 5.97311787e+07,
1.56409604e+08, 2.36347768e+08, 3.57140905e+08, 5.39669264e+08],
[1.00000000e+00, 1.79240000e+01, 5.85713000e+02, 6.18380000e+02,
3.21269776e+02, 1.04983198e+04, 1.10838431e+04, 3.43059718e+05,
3.62193205e+05, 3.82393824e+05, 5.75843947e+03, 1.88171884e+05,
1.98666804e+05, 6.14900239e+06, 6.49195101e+06, 6.85402691e+06,
2.00934537e+08, 2.12141269e+08, 2.23973034e+08, 2.36464693e+08],
[1.00000000e+00, 2.52522000e+02, 6.24270000e+02, 6.21184000e+02,
6.37673605e+04, 1.57641909e+05, 1.56862626e+05, 3.89713033e+05,
3.87786536e+05, 3.85869562e+05, 1.61026614e+07, 3.98080501e+07,
3.96112641e+07, 9.84111145e+07, 9.79246316e+07, 9.74405535e+07,
2.43286155e+08, 2.42083501e+08, 2.40886791e+08, 2.39695998e+08],
[1.00000000e+00, 3.32608000e+02, 5.70309000e+02, 5.61070000e+02,
1.10628082e+05, 1.89689336e+05, 1.86616371e+05, 3.25252355e+05,
3.19983271e+05, 3.14799545e+05, 3.67957850e+07, 6.30921906e+07,
6.20700978e+07, 1.08181535e+08, 1.06428996e+08, 1.04704847e+08,
1.85494346e+08, 1.82489339e+08, 1.79533014e+08, 1.76624581e+08],
[1.00000000e+00, 8.03320000e+01, 4.87969000e+02, 7.10184000e+02,
6.45323022e+03, 3.91995257e+04, 5.70505011e+04, 2.38113745e+05,
3.46547776e+05, 5.04361314e+05, 5.18400890e+05, 3.14897630e+06,
4.58298085e+06, 1.91281534e+07, 2.78388760e+07, 4.05163531e+07,
1.16192126e+08, 1.69104572e+08, 2.46112686e+08, 3.58189335e+08],
[1.00000000e+00, 3.62441000e+02, 5.50813000e+02, 8.00493000e+02,
1.31363478e+05, 1.99637215e+05, 2.90131483e+05, 3.03394961e+05,
4.40921951e+05, 6.40789043e+05, 4.76115105e+07, 7.23567117e+07,
1.05155545e+08, 1.09962773e+08, 1.59808193e+08, 2.32248222e+08,
1.67113889e+08, 2.42865542e+08, 3.52954935e+08, 5.12947143e+08]])
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Plain linear regression on the three raw (non-polynomial) features.
# NOTE(review): lin_model is fitted here but not used again in the visible
# part of the notebook - confirm whether a baseline comparison was intended.
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Polynomial regression: an ordinary linear regression fitted on the expanded
# degree-3 feature matrix.
polynomial_model = LinearRegression()
polynomial_model.fit(X_train_poly, Y_train)
# One coefficient per polynomial feature column, followed by the intercept.
print("Model coefficients = ", polynomial_model.coef_)
print("Constant term (bias) = ", polynomial_model.intercept_)
Model coefficients = [ 0.00000000e+00 8.44956564e+03 -7.03926860e+02 3.91781726e+03
-7.27929661e+00 -8.46412648e+00 -1.17297723e+01 1.85161715e+00
9.12487891e+00 -5.89023638e+00 7.03479781e-04 -1.54576830e-03
9.41465525e-03 9.78622737e-03 6.73304083e-02 6.58873295e-03
-5.15228130e-03 6.53665385e-03 -5.71172445e-03 4.06079530e-03]
Constant term (bias) = 878612.6569970567
5.3 Test the model¶
We can apply this model to both the original training data and to the test data:
# Predictions on the data the model was trained on.
y_train_predicted = polynomial_model.predict(X_train_poly)
# Predictions on the held-out data. The test set is only transformed with the
# transformer fitted on the training set; a transformer must never be re-fitted
# on test data.
y_test_predict = polynomial_model.predict(poly_features.transform(X_test))
Then measure the model quality for each case:
# Goodness of fit on the training and the test split:
# R2 and RMSE (square root of the mean squared error).
r2_train = r2_score(Y_train, y_train_predicted)
r2_test = r2_score(Y_test, y_test_predict)
mse_train = mean_squared_error(Y_train, y_train_predicted)
mse_test = mean_squared_error(Y_test, y_test_predict)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
# Print both metrics in the same "heading / Train / Test" layout.
for heading, train_value, test_value in (
    ("R2:", r2_train, r2_test),
    ("RMSE:", rmse_train, rmse_test),
):
    print(heading)
    print("Train = ", train_value)
    print("Test = ", test_value)
R2:
Train = 0.9791796776657181
Test = 0.9814345530715797
RMSE:
Train = 412795.4591276655
Test = 393860.84246070404
This gives us two regression evaluation metrics: the root mean squared error (RMSE) and the coefficient of determination (R2).
6. Clustering Customers¶
6.1 Standardize the data into a standard size¶
# First ten rows of the (imputed) preference data, before standardization.
perfume_preference[0:10]
| Narcissus | Agrumen | Oud | Jasmine | Amber | Neroli | Indole | Vanilla | Frankincense | Bergamot | Galbanum | Magnolia | Sandalwood | Cashmeran | Citron | Opopanax | Aliphatic Aldehydes | Vetiver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1353.0 | 1252.0 | 4066.0 | 3838.0 | 2144.0 | 4404.0 | 32082.0 | 3866.0 | 2505.0 | 3972.0 | 4485.0 | 6441.0 | 4106.0 | 1722.0 | 4287.0 | 4820.0 | 4140.0 | 1463.0 |
| 1 | 1089.0 | 2152.0 | 4045.0 | 3710.0 | 2235.0 | 4352.0 | 30398.0 | 4769.0 | 2995.0 | 4720.0 | 4532.0 | 10931.0 | 3794.0 | 1638.0 | 4648.0 | 4472.0 | 4184.0 | 1071.0 |
| 2 | 4177.0 | 3592.0 | 3596.0 | 1745.0 | 3234.0 | 2116.0 | 21678.0 | 4864.0 | 3178.0 | 3381.0 | 1376.0 | 18153.0 | 2502.0 | 1733.0 | 1747.0 | 2728.0 | 4580.0 | 4742.0 |
| 3 | 4899.0 | 3738.0 | 2454.0 | 3976.0 | 4945.0 | 3853.0 | 17963.0 | 3040.0 | 2943.0 | 2870.0 | 4016.0 | 18819.0 | 1990.0 | 5118.0 | 2391.0 | 2012.0 | 3470.0 | 3057.0 |
| 4 | 4822.0 | 4030.0 | 3447.0 | 4225.0 | 4078.0 | 3772.0 | 23988.0 | 3389.0 | 2415.0 | 2695.0 | 3887.0 | 20367.0 | 2118.0 | 4530.0 | 2427.0 | 3205.0 | 4319.0 | 2289.0 |
| 5 | 2251.0 | 2305.0 | 4058.0 | 3330.0 | 1775.0 | 4882.0 | 16567.0 | 5148.0 | 4443.0 | 2472.0 | 2615.0 | 11655.0 | 3061.0 | 1549.0 | 1563.0 | 1709.0 | 3426.0 | 3003.0 |
| 6 | 1661.0 | 2199.0 | 4994.0 | 2795.0 | 2231.0 | 4108.0 | 31511.0 | 3584.0 | 2771.0 | 4153.0 | 4462.0 | 11061.0 | 3791.0 | 2123.0 | 4528.0 | 4716.0 | 4124.0 | 2016.0 |
| 7 | 4690.0 | 3674.0 | 3827.0 | 2130.0 | 3483.0 | 2544.0 | 21010.0 | 4284.0 | 2457.0 | 3610.0 | 1819.0 | 18601.0 | 3917.0 | 2129.0 | 1609.0 | 2614.0 | 3879.0 | 3962.0 |
| 8 | 4735.0 | 3236.0 | 3255.0 | 3349.0 | 4221.0 | 4038.0 | 22356.0 | 3202.0 | 2804.0 | 2754.0 | 2968.0 | 16407.0 | 1899.0 | 4813.0 | 2878.0 | 2862.0 | 3125.0 | 2692.0 |
| 9 | 1259.0 | 2679.0 | 3541.0 | 3159.0 | 1937.0 | 4619.0 | 31967.0 | 4662.0 | 2797.0 | 3840.0 | 5327.0 | 13610.0 | 3317.0 | 1889.0 | 4457.0 | 4795.0 | 4390.0 | 1690.0 |
# Standardize the preference columns with sklearn's preprocessing.scale so that
# features with large raw values do not dominate the clustering distances.
standardized_customer_data = preprocessing.scale(perfume_preference)
# Wrap the scaled values in a DataFrame again, restoring the original column names.
standardized_customer_data_df = pd.DataFrame(standardized_customer_data,columns = perfume_preference.columns)
6.2 Decide the number of clusters - Elbow method¶
import matplotlib.pyplot as plt
# Elbow method: fit K-means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances) to see where adding clusters stops
# paying off.
k_values = range(1, 11)
sse = []
for k in k_values:
    # Fixed seed so the elbow curve is identical on every run.
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(standardized_customer_data_df)
    sse.append(kmeans.inertia_)
x = k_values
plt.xlabel('K')
plt.ylabel('SSE')
plt.plot(x, sse, 'o-')
plt.show()

The 'elbow' value on the graph indicates the optimum number of clusters. The number of clusters here is 4.
6.3 Cluster the data (build the model)¶
First we create the object('machine') that we will use to build the model.
# Four clusters, as read off the elbow plot. The fixed seed keeps the cluster
# numbering and centres stable between runs, so statements about
# "group 0", "group 1", ... keep referring to the same customers.
kmeans = KMeans(n_clusters=4, random_state=0)
Then we use that object to identify clusters in the data.
# Fit the K-means model on the standardized preference data.
kmeans.fit(standardized_customer_data_df)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
# Cluster label of every customer. The model has already been fitted on this
# data, so reuse its labels instead of running the whole fit a second time.
y_km = kmeans.labels_
6.4 Review the Results¶
We want to know how tightly clustered each group is. That is, we identify the variance of each of these groups.
# Tightness of each cluster: the total within-cluster variance, i.e. the sum
# over all standardized features of the (population) variance of the members.
# A smaller value means a tighter cluster.
# Taking np.var of the boolean mask (y_km == k) would only give p * (1 - p),
# where p is the share of customers in the cluster; that measures cluster
# size, not spread.
cluster_variances = [
    standardized_customer_data_df[y_km == k].var(axis=0, ddof=0).sum()
    for k in range(4)
]
km0_var, km1_var, km2_var, km3_var = cluster_variances
for k, variance in enumerate(cluster_variances):
    print("Variance of group %d is %f" % (k, variance))
Variance of group 0 is 0.188117
Variance of group 1 is 0.187171
Variance of group 2 is 0.190040
Variance of group 3 is 0.184611
Group 1 is the tightest, whereas group 2 is rather loose.
Specifically, for each customer group identified, we show the spread (variance) along each dimension (quantity of scent chemical):
# Spread inside group 0: population variance (ddof=0) of every raw preference
# column, computed over the customers labelled 0 only.
group0_members = perfume_preference[y_km == 0]
km0_che_var = group0_members.var(axis=0, ddof=0)
print("Variance for each dimension of group 0 is: ")
km0_che_var
Variance for each dimension of group 0 is:
Narcissus 8.070874e+04
Agrumen 1.860085e+05
Oud 1.247092e+05
Jasmine 1.376372e+05
Amber 1.754141e+05
Neroli 9.981178e+04
Indole 1.814991e+06
Vanilla 1.461047e+05
Frankincense 4.453063e+04
Bergamot 1.587021e+05
Galbanum 5.220495e+04
Magnolia 4.646825e+06
Sandalwood 8.379077e+04
Cashmeran 8.081125e+04
Citron 1.889976e+05
Opopanax 7.244132e+04
Aliphatic Aldehydes 1.835542e+05
Vetiver 1.363927e+05
dtype: float64
We can see that they are similar with regard to some chemicals but have a wide range of responses to other scent chemicals.
The other three groups did much the same:
# Spread inside group 1: population variance (ddof=0) of every raw preference
# column, computed over the customers labelled 1 only.
group1_members = perfume_preference[y_km == 1]
km1_che_var = group1_members.var(axis=0, ddof=0)
print("Variance for each dimension of group 1 is: ")
km1_che_var
Variance for each dimension of group 1 is:
Narcissus 1.041274e+05
Agrumen 6.039455e+04
Oud 1.338378e+05
Jasmine 1.140873e+05
Amber 1.361587e+05
Neroli 1.831613e+05
Indole 2.477114e+06
Vanilla 9.723437e+04
Frankincense 4.640931e+04
Bergamot 5.826429e+04
Galbanum 2.036101e+05
Magnolia 1.511061e+06
Sandalwood 4.278111e+04
Cashmeran 1.827669e+05
Citron 8.646526e+04
Opopanax 9.921890e+04
Aliphatic Aldehydes 1.151117e+05
Vetiver 1.233736e+05
dtype: float64
# Spread inside group 2: population variance (ddof=0) of every raw preference
# column, computed over the customers labelled 2 only.
group2_members = perfume_preference[y_km == 2]
km2_che_var = group2_members.var(axis=0, ddof=0)
print("Variance for each dimension of group 2 is: ")
km2_che_var
Variance for each dimension of group 2 is:
Narcissus 9.723301e+04
Agrumen 1.944770e+05
Oud 1.962811e+05
Jasmine 1.289577e+05
Amber 6.918558e+04
Neroli 4.724214e+04
Indole 1.778136e+06
Vanilla 1.525133e+05
Frankincense 1.786047e+05
Bergamot 7.893402e+04
Galbanum 1.276341e+05
Magnolia 4.858489e+06
Sandalwood 6.911784e+04
Cashmeran 6.365879e+04
Citron 4.415088e+04
Opopanax 7.111018e+04
Aliphatic Aldehydes 9.197844e+04
Vetiver 1.581174e+05
dtype: float64
# Spread inside group 3: population variance (ddof=0) of every raw preference
# column, computed over the customers labelled 3 only.
group3_members = perfume_preference[y_km == 3]
km3_che_var = group3_members.var(axis=0, ddof=0)
print("Variance for each dimension of group 3 is: ")
km3_che_var
Variance for each dimension of group 3 is:
Narcissus 1.219228e+05
Agrumen 9.138574e+04
Oud 1.677518e+05
Jasmine 5.476186e+04
Amber 5.526990e+04
Neroli 1.466144e+05
Indole 3.613530e+06
Vanilla 1.238448e+05
Frankincense 1.504654e+05
Bergamot 4.178902e+04
Galbanum 7.548093e+04
Magnolia 2.287027e+06
Sandalwood 1.405492e+05
Cashmeran 1.905020e+05
Citron 1.384131e+05
Opopanax 1.444139e+05
Aliphatic Aldehydes 8.179831e+04
Vetiver 1.320200e+05
dtype: float64
# Cluster labels assigned to the first twenty customers.
print(y_km[0:20])
[2 2 3 1 1 0 2 3 1 2 3 2 2 2 3 1 0 0 1 2]
# First five customers (raw, unstandardized values) that were assigned to group 2.
print(perfume_preference[y_km == 2][0:5])
Narcissus Agrumen Oud Jasmine Amber Neroli Indole Vanilla \
0 1353.0 1252.0 4066.0 3838.0 2144.0 4404.0 32082.0 3866.0
1 1089.0 2152.0 4045.0 3710.0 2235.0 4352.0 30398.0 4769.0
6 1661.0 2199.0 4994.0 2795.0 2231.0 4108.0 31511.0 3584.0
9 1259.0 2679.0 3541.0 3159.0 1937.0 4619.0 31967.0 4662.0
11 1683.0 2078.0 3989.0 2873.0 2038.0 4309.0 30393.0 3648.0
Frankincense Bergamot Galbanum Magnolia Sandalwood Cashmeran Citron \
0 2505.0 3972.0 4485.0 6441.0 4106.0 1722.0 4287.0
1 2995.0 4720.0 4532.0 10931.0 3794.0 1638.0 4648.0
6 2771.0 4153.0 4462.0 11061.0 3791.0 2123.0 4528.0
9 2797.0 3840.0 5327.0 13610.0 3317.0 1889.0 4457.0
11 3011.0 4418.0 4092.0 10568.0 3768.0 1746.0 4483.0
Opopanax Aliphatic Aldehydes Vetiver
0 4820.0 4140.0 1463.0
1 4472.0 4184.0 1071.0
6 4716.0 4124.0 2016.0
9 4795.0 4390.0 1690.0
11 4495.0 4602.0 2307.0
We can look at the clusters in chart form.
# Jasmine vs Vanilla, one colour per K-means group.
plt.figure(figsize=(7, 7))
group_colors = ['red', 'black', 'blue', 'cyan']
for group_id, color in enumerate(group_colors):
    # Select the group's customers once and reuse the selection for both axes.
    members = perfume_preference[y_km == group_id]
    plt.scatter(members['Jasmine'], members['Vanilla'],
                s=15, c=color, alpha=.5, label=f"Group {group_id}")
# Label the axes and add a legend so the chart can be read on its own.
plt.xlabel('Jasmine')
plt.ylabel('Vanilla')
plt.legend()
<matplotlib.collections.PathCollection at 0x1a86c2add48>

Take a '3D' view of the data
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib
# 3D view of the groups along the three features used in the regression model.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
# Elevation and azimuth of the camera, in degrees.
ax.view_init(20, 20)
ax.set_xlabel('Jasmine')
ax.set_ylabel('Vanilla')
ax.set_zlabel('Aliphatic Aldehydes')
group_colors = ['red', 'black', 'blue', 'cyan']
for group_id, color in enumerate(group_colors):
    # Select the group's customers once and reuse the selection for all three axes.
    members = perfume_preference[y_km == group_id]
    ax.scatter(members['Jasmine'], members['Vanilla'], members['Aliphatic Aldehydes'],
               s=15, c=color, alpha=.3, label=f"Group {group_id}")
ax.legend()
Using matplotlib backend: Qt5Agg
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1a872c3bc88>

The central point of K-means:
# Cluster centres of the fitted model: one row per cluster, one column per
# standardized feature.
centers = kmeans.cluster_centers_
# Bare expression: shows the array when run as a notebook cell.
centers
array([[-0.63508675, -0.41641312, 0.1801924 , 0.10214018, -0.71073531,
0.92559669, -0.91458644, 0.69966415, 1.49956699, -0.59666119,
-0.60720389, -0.41660319, 0.57117372, -0.48341316, -0.74077772,
-0.91436421, -0.42815036, 0.10413989],
[ 1.01512708, 0.75401605, -1.09476607, 1.04776978, 1.37499078,
0.14680867, -0.54262287, -1.52078422, -0.61896137, -1.13774394,
0.62329158, 0.75422437, -1.61128735, 1.66687438, -0.18799296,
-0.54276709, -0.53338134, -0.30061661],
[-1.2345529 , -1.27841445, 0.84571864, 0.21695373, -1.00251791,
0.4677676 , 1.5634698 , 0.12346678, -0.43796367, 1.19821614,
1.18028798, -1.27839257, 0.56702368, -0.50325856, 1.59486712,
1.56359936, 0.82022759, -1.19924176],
[ 0.90638695, 0.99381461, 0.04883221, -1.40107939, 0.37454187,
-1.59030928, -0.13842807, 0.70373206, -0.45305188, 0.52350583,
-1.24439817, 0.99377463, 0.46500412, -0.67859451, -0.71193968,
-0.13864473, 0.12810446, 1.45221773]])
We can visualize these arrays:
from matplotlib.pyplot import figure
# Plot every cluster centre (one marker series per group) across all features.
figure(figsize=(20, 10))
# Read the centres straight from the fitted model, so row i always belongs to
# the customers labelled i in y_km. A pasted copy of the numbers goes stale and
# can end up in a different row order than the model's own labels.
y = kmeans.cluster_centers_
x = np.arange(y.shape[1])
plt.title("Plotting Central Points")
plt.xlabel("Features")
plt.ylabel("Preference")
for i, center in enumerate(y):
    # Colours come from matplotlib's default cycle, so the series stay distinct.
    plt.scatter(x, center, s=150, marker="o", label=f"Group {i}")
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
# Tick labels are taken from the clustered data itself, so names are complete
# and in the same column order as the centres.
plt.xticks(x, standardized_customer_data_df.columns, rotation=45, ha="right")
plt.show()

We treat a feature as preferred by a group's customers when the group's standardized centre value for that feature is positive, i.e. above the overall average.
Best mixture for each group:
- Group 0: Narcissus, Agrumen, Oud, Amber, Vanilla, Bergamot, Magnolia, Sandalwood, Aliphatic Aldehydes, Vetiver
- Group 1: Oud, Jasmine, Neroli, Indole, Vanilla, Bergamot, Galbanum, Sandalwood, Citron, Opopanax, Aliphatic Aldehydes
- Group 2: Narcissus, Agrumen, Jasmine, Amber, Neroli, Galbanum, Magnolia, Cashmeran
- Group 3: Oud, Jasmine, Neroli, Vanilla, Frankincense, Sandalwood, Vetiver