Customer Data Analysis and Clustering

This notebook aims to analyze and cluster data from clients of a wholesale distributor using techniques such as Principal Component Analysis and k-means clustering.

The data is fetched from the UCI Machine Learning Repository. Visit the project GitHub repository or its Kaggle notebook.

Author: Bruce Nguyen

Table of Contents

  1. Data Cleaning
  2. Exploratory Data Analysis
  3. Principal Component Analysis (PCA)
  4. Kernel Principal Component Analysis (KPCA)
  5. Clustering

Data Cleaning

We start by reading the data and trying to get familiar with it.

To make the categorical variables more informative, the numeric codes in the Channel and Region variables were replaced with their actual meanings from the dataset description.
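As a sketch of that substitution, using `pandas.Series.map` - the code-to-label mappings below follow the UCI dataset description, while the small frame is a hypothetical stand-in for the real data:

```python
import pandas as pd

# Hypothetical sample rows mirroring the UCI wholesale customers schema
df = pd.DataFrame({
    "Channel": [1, 2, 1],
    "Region": [3, 1, 2],
    "Fresh": [12669, 7057, 6353],
})

# Replace the numeric codes with the meanings from the dataset description:
# Channel: 1 = HoReCa, 2 = Retail; Region: 1 = Lisbon, 2 = Oporto, 3 = Other
df["Channel"] = df["Channel"].map({1: "HoReCa", 2: "Retail"})
df["Region"] = df["Region"].map({1: "Lisbon", 2: "Oporto", 3: "Other"})
```

Any code not covered by the mapping dictionaries would become `NaN`, which doubles as a check that no unexpected values slipped through.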

Well, it looks like the data is clean enough, with no missing values! All that's left to do is save it for future use.

Exploratory Data Analysis

We can see that there are two types of features in this dataset: categorical and continuous. Let's create lists to keep track of them.
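One way to build those lists is to split the columns by dtype; a minimal sketch, with a hypothetical one-row frame standing in for the cleaned data:

```python
import pandas as pd

# Hypothetical frame with the dataset's columns; string-typed columns are
# categorical, numeric ones are continuous
df = pd.DataFrame({
    "Channel": ["HoReCa"], "Region": ["Other"],
    "Fresh": [12669], "Milk": [9656], "Grocery": [7561],
    "Frozen": [214], "Detergents_Paper": [2674], "Delicatessen": [1338],
})

# select_dtypes keeps the original column order
categorical_features = df.select_dtypes(include="object").columns.tolist()
continuous_features = df.select_dtypes(include="number").columns.tolist()
```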

Now, we can start exploring! We can begin by looking into the categorical features first, as they are the simplest.

A quick sanity check shows a healthy number of counts in all the categories. Looking at the first plot more closely, it is clear that the HoReCa (Hotels/Restaurants/Cafes) channel is used by far more customers than the other one. As for the regions, a large number of orders do not record a specific region, leaving us with a substantial 'Other' category. Moving on, how about we combine the two features?

In the new plot, not much has changed about the relative sizes of the categories. That's it for those two features. We now move on to the more important part - analysing the numbers! A good start is to plot them all at once for a preliminary check of their distributions.

We can see that all of the quantities follow a shape akin to a chi-squared distribution, with most of the values concentrated near 0. That said, to better understand the relative differences, we should put them in the same plot on a single scale. This can be done with violin plots.
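The key step for a single-scale plot is reshaping the data to long format; a minimal sketch with hypothetical spending rows (in the notebook, the full cleaned frame would be melted instead):

```python
import pandas as pd

# Hypothetical spending rows standing in for the cleaned dataset
df = pd.DataFrame({
    "Fresh": [12669, 7057],
    "Milk": [9656, 9810],
    "Delicatessen": [1338, 1776],
})

# Reshape to long format so every feature shares one value axis
long_df = df.melt(var_name="Feature", value_name="Spending")

# seaborn can then draw all distributions on a single scale, e.g.:
# sns.violinplot(data=long_df, x="Feature", y="Spending")
```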

Looks great! Now we know that Delicatessen has the sharpest peak of all the distributions, while Fresh has fatter tails than the rest of the features. To investigate the relationships between the continuous features, we can draw pair plots between each of them. This way, we might even get a hint of what kind of clusters we are going to get.

Overall, there is not much correlation between the features, with the exception of Grocery and Detergents_Paper. However, we can go one step further by plotting the pairs along with the labels from the categorical variables. All it takes is a single extra line of code.

We can immediately see a lot of interesting details, with clusters forming in many of the scatter plots. Notably, Retail customers buy far more Detergents_Paper (in dollars) than HoReCa ones, so this feature segments the two categories nicely. The same can be said of Grocery, though to a lesser extent. Overall, there are many differences between the two types of customers, which is worth noting from a business perspective.

Next, we also plot the continuous features against the categories within Region.

Unfortunately, there is apparently not much variability between the different categories in Region along any of the features. However, we need to dive further into the analysis before coming to any conclusion.

With the exploratory analysis out of the way, we move on to actually clustering the data.

Principal Component Analysis (PCA)

Before applying any clustering algorithm, we first need to reduce the number of dimensions in the data. This is because in high dimensions, the performance of many classical machine learning algorithms drops significantly, while the need for computational resources increases exponentially (the "curse of dimensionality").

The PCA process reduces the number of dimensions by computing the principal components that explain the most variance in the data. Think of it as a photographer trying to find the best angle for a picture of a group of people, reducing the number of dimensions while minimizing the loss of information (of course the best angle is in front of the group!).

We need to scale the data beforehand so that every feature has an equal effect on the analysis.
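A minimal sketch of that standardization with scikit-learn's `StandardScaler` (the array below is a hypothetical stand-in for the continuous features):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Hypothetical spending values standing in for the continuous features
X = np.array([[12669.0, 9656.0],
              [7057.0, 9810.0],
              [6353.0, 8808.0]])

# Standardize so every feature has mean 0 and unit variance,
# giving each one equal weight in the PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
```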

An ad hoc test is then used to find the smallest number of components that still captures most of the variance. This works by plotting the cumulative amount of variance explained against a progressively higher number of components.
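The quantity being plotted can be computed like this - a sketch on synthetic correlated data, since the real scaled frame isn't reproduced here:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
# Synthetic stand-in: 6 features, 3 of which nearly duplicate the others,
# so only a few components carry most of the variance
base = rng.normal(size=(200, 3))
X = np.hstack([base, base + 0.1 * rng.normal(size=(200, 3))])

# Fit a full PCA and accumulate the per-component explained variance
pca = PCA().fit(X)
cumulative = np.cumsum(pca.explained_variance_ratio_)
# Plotting `cumulative` against the component count reveals the "elbow"
# where adding further components stops paying off
```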

Here, we choose to use 4 components, as, according to the plot, there is not much more to gain by using more. After this, it is simply a matter of fitting the data and computing the components!
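The fit itself is a one-liner; a sketch with a random placeholder array (the real notebook would pass the scaled wholesale features, which have 440 rows and 6 columns):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(42)
# Placeholder for the scaled data: 440 customers, 6 continuous features
X_scaled = rng.normal(size=(440, 6))

# Project onto the 4 directions of greatest variance
pca = PCA(n_components=4)
components = pca.fit_transform(X_scaled)
```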

We can see that our components explain about 95% of the variance in the data, which means we have retained most of the information. Next, the relationship between the new components and the original features can be visualized using scatter plots. For convenience, I only use the first 2 principal components.

Looking at the plots, we can see some very interesting 'directions' of change in the data. Take the Fresh and Frozen plots as an example: the colors and sizes here change mostly along the y-axis, showing that the 2nd principal component explains more of the variance in these 2 features. On the other hand, the Detergents_Paper and Grocery plots see most of their changes along the x-axis, showing that they are better described by the first component.

We can also check how the components relate to the Channel and Region categorical variables. In a way, PCA is also a clustering method, as the most dominant patterns that separate groups are usually captured by PCA, especially within the first component. Nevertheless, we need to plot these features against the values of the components to find out.

In the first chart, it can already be seen that the first principal component manages to separate the values labeled Retail from those labeled HoReCa. In the second plot, things do not go as smoothly with the regions, with values from the 3 categories still clustering together.

For even more information, we add another dimension to our plot - the 3rd component - and see how much information the new component can give us visually.

The 3D space surely looks more intuitive and fun to play with! However, there is not much improvement in the new plots over the 2D ones regarding the clustering task itself.

From the 4 plots, we can see that there is a significant overall difference in customer spending between the categories of Channel, further confirming our initial assumption. On the other hand, there is, unsurprisingly, not much variability between customers from different regions, contradicting a widespread assumption in the business world.

That said, I believe that we can go even further by employing a more advanced technique: Kernel Principal Component Analysis.

Kernel Principal Component Analysis (KPCA)

KPCA extends PCA by using a kernel function, giving us access to higher-dimensional feature spaces without the computational complications that would otherwise follow. Aside from the process of computing the components itself, this section repeats many of the steps from the last one.
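A minimal sketch with scikit-learn's `KernelPCA` - the RBF kernel and `gamma` value below are illustrative assumptions, and the array is a placeholder for the scaled features:

```python
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.default_rng(0)
# Placeholder for the scaled continuous features
X_scaled = rng.normal(size=(100, 6))

# The kernel implicitly maps the data into a higher-dimensional space;
# an RBF kernel is a common default, though the best choice is data-dependent
kpca = KernelPCA(n_components=4, kernel="rbf", gamma=0.1)
X_kpca = kpca.fit_transform(X_scaled)
```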

We can already see that the direction of changes in value has become very non-linear, and the data has become much more scattered.

In this new space, the data looks comparatively more separated by category, especially with regard to Channel. However, the separation is still not very satisfactory, and I can feel the need for an extra dimension. For the other feature, the situation remains the same.

Both of these plots look much better than the ones before. Thus, I choose to perform clustering on the KPCA values in order to generate the best results. This way, the principal components become the latent variables that are fed into the clustering algorithms. All we need to do now is save the values into the data frame and create a new .csv file - for good measure.

Clustering

In this project, we will use the k-means clustering algorithm. The algorithm finds $k$ centers, one for each cluster, by minimizing the within-cluster variances. However, before actually using the algorithm, we need to find the right number of clusters - the right $k$ - by using another elbow test.
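The elbow test can be sketched as follows - synthetic blob data stands in for the saved KPCA values, and the range of candidate $k$ is an assumption:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic stand-in for the latent values, with 5 planted clusters
X, _ = make_blobs(n_samples=300, centers=5, n_features=4, random_state=0)

# Within-cluster variance (inertia) for an increasing number of clusters;
# inertia keeps shrinking with k, and the "elbow" is where the
# improvement starts flattening out
inertias = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in range(1, 11)
]
```

Plotting `inertias` against `range(1, 11)` gives the elbow curve used to pick $k$.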

The diagnostics point us towards using 5 clusters, and our task now is simply to cluster the data with that parameter.
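A sketch of the final fit with $k=5$, again on synthetic stand-in data rather than the notebook's saved KPCA components:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Stand-in for the latent values fed into the clustering step
X, _ = make_blobs(n_samples=300, centers=5, n_features=4, random_state=0)

# Fit k-means with the k suggested by the elbow test and
# assign each customer a cluster label
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
labels = kmeans.fit_predict(X)
```

The `labels` array can then be appended to the data frame as the cluster assignment for each customer.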

All is good so far! Now is the moment the numbers turn into geometric shapes. First, we plot the clusters against the Fresh, Milk, and Grocery values, as well as the Channel feature.

Even though the clusters separate the values along these variables to some extent, the segmentation is not exactly clear-cut. Next comes the same plot but with the features Frozen, Detergents_Paper, and Delicatessen.

We can see the same pattern in this plot. Nevertheless, I proceed with creating more visualizations to give us a better perspective and, in turn, insights.

This plot shows that there are different interesting patterns within each group, especially in group 1. The amount purchased in this group is significant across almost all the continuous features. To see more details, we can expand on the graph with categorical information.

We can see that some clusters have their values concentrated around the categories within either Channel or Region, such as cluster number 1. Meanwhile, other clusters have their values distributed more evenly across those features.

Now that all the auxiliary visualizations are done, we proceed to arguably the most important graph in this project: the polar plot. This plot is crucial for seeing the characteristics of each cluster with regard to how much money its customers spend on each category. To make the chart, we need to min-max scale the data first.
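A minimal sketch of that min-max scaling, applied to hypothetical per-cluster mean spending (the real notebook would scale the aggregated cluster profile instead):

```python
import pandas as pd

# Hypothetical mean spending per cluster for three features
cluster_means = pd.DataFrame(
    {"Fresh": [35000.0, 8000.0],
     "Milk": [5000.0, 12000.0],
     "Delicatessen": [1500.0, 3000.0]},
    index=["Cluster 0", "Cluster 1"],
)

# Min-max scale each feature to [0, 1] so all polar axes are comparable
scaled = (cluster_means - cluster_means.min()) / (
    cluster_means.max() - cluster_means.min()
)
```

Without this step, high-magnitude features like Fresh would dominate the polar axes and drown out the smaller ones.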

Finally, all we need to do now is plot the data!

Voilà! We can now see the characteristics of each cluster. For example, the first cluster is a group of customers who buy a lot of Fresh products - much more than any other group, in fact. Cluster 1 is another interesting grouping, with its members consuming considerably more Delicatessen. We can keep going with this type of analysis for the rest of the clusters.

All the information extracted here will help the business target its offerings and promotions, as well as develop customized marketing campaigns. The clusters formed above act as customer personae that help guide business decisions.