 # Clustering or classifying data into groups

How can we cluster or classify data into different categories in a visual form, such that if we change the data, the change is reflected in the visual as well?

Like this?

``````python
from sklearn.cluster import KMeans
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

def confidence_ellipse(x, y, ax, n_std=3.0, facecolor="none", **kwargs):
    """Draw the covariance confidence ellipse of *x* and *y* on *ax*.

    Args:
        x: 1-D NumPy array with the horizontal coordinates of the points.
        y: 1-D NumPy array with the vertical coordinates; same size as *x*.
        ax: Matplotlib axes the ellipse is added to.
        n_std: Number of standard deviations that set the ellipse radii.
        facecolor: Fill colour of the ellipse patch.
        **kwargs: Forwarded unchanged to ``matplotlib.patches.Ellipse``.

    Returns:
        The ``Ellipse`` patch that was added to *ax*.

    Raises:
        ValueError: If *x* and *y* do not have the same number of elements.
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Round-off can push the coefficient a hair outside [-1, 1], which would
    # make one of the square roots below NaN.
    pearson = np.clip(pearson, -1.0, 1.0)

    # For standardised data the covariance matrix is [[1, p], [p, 1]]; its
    # eigenvalues 1 + p and 1 - p are the squared radii of the unit ellipse.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse(
        (0, 0),
        width=ell_radius_x * 2,
        height=ell_radius_y * 2,
        facecolor=facecolor,
        **kwargs,
    )

    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # Rotate the unit ellipse onto the diagonal, stretch it to the requested
    # number of standard deviations, then move it to the data mean.
    transf = (
        transforms.Affine2D()
        .rotate_deg(45)
        .scale(scale_x, scale_y)
        .translate(mean_x, mean_y)
    )
    ellipse.set_transform(transf + ax.transData)
    # Without add_patch the ellipse is never attached to the axes and so is
    # never rendered.
    return ax.add_patch(ellipse)

@st.cache
def data():
    """Return random sample points drawn from a standard normal distribution.

    1000 values are drawn and reshaped into rows of two columns, giving an
    array of shape (500, 2).
    """
    # NOTE(review): newer Streamlit releases deprecate st.cache in favour of
    # st.cache_data -- confirm against the Streamlit version in use.
    samples = np.random.normal(loc=0, scale=1, size=1000)
    return samples.reshape(-1, 2)

# Streamlit app body: cluster the sample points with k-means and plot each
# cluster, optionally with a confidence ellipse and one subplot per cluster.
X = data()

cluster_slider = st.slider(
    min_value=1, max_value=6, value=2, label="Number of clusters: "
)
kmeans = KMeans(n_clusters=cluster_slider, random_state=0).fit(X)
labels = kmeans.labels_

selectbox = st.selectbox("Visualize confidence bounds", [False, True])
stdbox = st.selectbox("Number of standard deviations: ", [1, 2, 3])

# One colour per cluster id; six entries match the slider's max_value.
clrs = ["red", "seagreen", "orange", "blue", "yellow", "purple"]

# Sorted so clusters are always laid out left-to-right by label.
unique_labels = sorted(set(labels))
n_labels = len(unique_labels)

individual = st.selectbox("Individual subplots?", [False, True])

if individual:
    # squeeze=False always yields a 2-D array of Axes. Without it a single
    # cluster returns a bare Axes object and the indexing below fails.
    fig, ax = plt.subplots(ncols=n_labels, squeeze=False)
    ax = ax[0]
else:
    fig, ax = plt.subplots()

for i, yi in enumerate(unique_labels):
    a = ax[i] if individual else ax

    xi = X[labels == yi]
    x_pts = xi[:, 0]
    y_pts = xi[:, 1]
    a.scatter(x_pts, y_pts, c=clrs[yi])

    if selectbox:
        confidence_ellipse(
            x=x_pts,
            y=y_pts,
            ax=a,
            edgecolor="black",
            facecolor=clrs[yi],
            alpha=0.2,
            n_std=stdbox,
        )

fig.tight_layout()
st.write(fig)

``````

Updated with individual plots:

1 Like

Yes, something like this. Thank you.
But can we make it more visually categorical?
For example, could we split the data into 4 blocks so that we can inspect the homogeneous data in each category/block?

1 Like

Updated. Is this what you meant?

Obviously you’ll have to think of a dynamic subplot layout that fits your data best, but you get the point.

1 Like

yeah, i got it. thanks.