K-Means clustering groups similar records by partitioning a dataset into K clusters, revealing natural patterns and enhancing data organization. In this walkthrough we apply it, along with hierarchical clustering and DBSCAN, to the numeric features of a movies dataset, supporting targeted analysis and decision-making.
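Before diving into the code, it helps to see what the algorithm actually does: K-Means alternates between assigning each point to its nearest centroid and moving each centroid to the mean of its assigned points. The sketch below is illustrative only (the helper name kmeans_step is ours, and it assumes every cluster keeps at least one point); the scikit-learn KMeans used later handles all of this internally.
import numpy as np

def kmeans_step(X, centroids):
    # Assignment step: attach each point to its nearest centroid (Euclidean distance)
    labels = np.argmin(np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2), axis=1)
    # Update step: move each centroid to the mean of the points assigned to it
    new_centroids = np.array([X[labels == k].mean(axis=0) for k in range(len(centroids))])
    return labels, new_centroids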
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import os
# Reading the data and dropping columns we don't need
df = pd.read_csv("movies.csv")
df.drop(labels=['rating', 'genre', 'released', 'director', 'writer', 'star', 'country', 'company'], axis=1, inplace=True)
df.dropna(inplace=True)
# Keeping only the numeric columns and scaling them to [0, 1] so no single feature dominates the distances
df1_numeric = df.select_dtypes(include=np.number)
scale = MinMaxScaler()
df1_numeric_scaled = pd.DataFrame(scale.fit_transform(df1_numeric), index=df1_numeric.index, columns=df1_numeric.columns)
# Applying K-Means for a range of K and recording the distortion (inertia) of each fit
distortions = []
K = range(1, 20)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(df1_numeric_scaled)
    distortions.append(kmeanModel.inertia_)
# Plotting elbow graph
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
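The "distortion" plotted above is scikit-learn's inertia: the sum of squared distances from each point to its assigned cluster center. As a sanity check (an added snippet, not part of the original pipeline; kmeanModel is simply the last model fitted in the loop), it can be recomputed by hand:
# Recomputing inertia manually to confirm what the elbow curve measures
centers = kmeanModel.cluster_centers_[kmeanModel.labels_]
manual_inertia = ((df1_numeric_scaled.values - centers) ** 2).sum()
print(manual_inertia, kmeanModel.inertia_)  # the two values should agree closely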
# Based on the elbow plot, candidate values of K are 2, 3 and 5
# K-Means for K = 2
kmeans2 = KMeans(n_clusters=2, max_iter=100)
kmeans2.fit(df1_numeric_scaled)
labels2 = kmeans2.predict(df1_numeric_scaled)
print(labels2)
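Before plotting, a quick added check (not in the original code) shows how the movies split between the two clusters:
# Counting how many rows fall in each cluster
unique, counts = np.unique(labels2, return_counts=True)
print(dict(zip(unique, counts)))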
# Plotting the two clusters (splitting the scaled data by the predicted labels)
label_0 = df1_numeric_scaled[labels2 == 0]
label_1 = df1_numeric_scaled[labels2 == 1]
plt.scatter(label_0['year'], label_0['gross'], color='red')
plt.scatter(label_1['year'], label_1['gross'], color='blue')
plt.show()
# For K = 3
kmeans3 = KMeans(n_clusters=3, max_iter=100)
kmeans3.fit(df1_numeric_scaled)
labels3 = kmeans3.predict(df1_numeric_scaled)
print(labels3)
[0 0 0 ... 1 1 1]
# Adding a 'cluster' column that holds each row's K=3 cluster label
df3 = df1_numeric_scaled.copy()
df3['cluster'] = kmeans3.labels_
df_kmeans3 = df3
# view clustered data
df_kmeans3
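To interpret what the three clusters mean, an added exploration step (not in the original notebook) is to compare their average scaled feature values:
# Per-cluster feature means: a quick profile of what separates the groups
print(df_kmeans3.groupby('cluster').mean())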
# Selecting the rows belonging to cluster 0, cluster 1 and cluster 2
# Can also do:
# label_0 = df1_numeric_scaled[labels3 == 0]
# label_1 = df1_numeric_scaled[labels3 == 1]
# label_2 = df1_numeric_scaled[labels3 == 2]
label_0 = df_kmeans3[df_kmeans3['cluster'] == 0]
label_1 = df_kmeans3[df_kmeans3['cluster'] == 1]
label_2 = df_kmeans3[df_kmeans3['cluster'] == 2]
# Plotting the three clusters
plt.scatter(label_0['year'], label_0['gross'], color = 'red')
plt.scatter(label_1['year'], label_1['gross'], color = 'blue')
plt.scatter(label_2['year'], label_2['gross'], color = 'yellow')
plt.show()
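The elbow plot suggested several candidate values of K; silhouette scores offer a second opinion (an added validation step using sklearn.metrics.silhouette_score, not part of the original walkthrough). Scores closer to 1 indicate better-separated clusters:
from sklearn.metrics import silhouette_score
for k in (2, 3, 5):
    labels_k = KMeans(n_clusters=k).fit_predict(df1_numeric_scaled)
    print(k, silhouette_score(df1_numeric_scaled, labels_k))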
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
from sklearn.cluster import AgglomerativeClustering
# Hierarchical clustering
# Dropping the K-Means cluster column so only the original features are linked
linkagedf = df3.drop(columns=['cluster'])
# Made the dataframe smaller: selected only the top 100 rows
Hdf = linkagedf.iloc[0:100, :]
# Min-Max scaling the top 100 rows (reusing the MinMaxScaler defined above)
Hdf_scaled = scale.fit_transform(Hdf)
Hdf_scaled_numeric = pd.DataFrame(Hdf_scaled, index=Hdf.index, columns=Hdf.columns)
Hdf_scaled_numeric
# Distance metric: Euclidean, method = "ward"
plt.rcParams["figure.figsize"] = (8, 8)  # set the figure size before drawing
Hierarchical_euclidean = linkage(Hdf_scaled_numeric, method='ward', metric='euclidean')
dendrogram(Hierarchical_euclidean, labels=Hdf_scaled_numeric.index)
plt.title("Hierarchical Clustering Using Ward Method & Euclidean Distance")
plt.xlabel('sample index')
plt.ylabel('distance (Ward)')
plt.show()
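The cut_tree import above can turn the dendrogram into flat cluster labels by cutting the tree at a chosen number of clusters (the choice of 3 below is ours, for illustration only):
# Cutting the linkage tree into 3 flat clusters
flat_labels = cut_tree(Hierarchical_euclidean, n_clusters=3).flatten()
print(flat_labels[:10])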
# DBSCAN: Density-Based Spatial Clustering of Applications with Noise
from sklearn.cluster import DBSCAN
# A common rule of thumb for DBSCAN: min_samples = 2 * number of dimensions
# We also need a good 'eps' parameter; to estimate it we look at k-nearest-neighbor distances
from sklearn.neighbors import NearestNeighbors
a = df1_numeric_scaled  # using the scaled numeric data for DBSCAN
nbrs = NearestNeighbors(n_neighbors=5).fit(a)
# Find the k-neighbors of a point
neigh_dist, neigh_ind = nbrs.kneighbors(a)
sort_neigh_dist = np.sort(neigh_dist, axis=0)
k_dist = sort_neigh_dist[:, 4]
plt.plot(k_dist)
plt.axhline(y=2.5, linewidth=1, linestyle='dashed', color='k')
plt.ylabel("k-NN distance")
plt.xlabel("Sorted observations (4th NN)")
plt.show()
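Reading the knee off the plot by eye works; it can also be approximated programmatically. The heuristic below (an added sketch, not from the original post) takes the sorted k-distance with the largest second difference as a candidate eps:
# Rough knee estimate: largest second difference in the sorted k-distances
second_diff = np.diff(k_dist, 2)
knee_idx = int(np.argmax(second_diff)) + 1
print("candidate eps:", k_dist[knee_idx])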
clusters = DBSCAN(eps=.1, min_samples=2, algorithm='ball_tree', metric='euclidean', leaf_size=90, p=2).fit(a)
from collections import Counter
Counter(clusters.labels_)
Counter({0: 5430, -1: 5})
p = sns.scatterplot(data=a, x="year", y="runtime", hue=clusters.labels_, legend="full", palette="deep")
sns.move_legend(p, "upper right", bbox_to_anchor=(1.17, 1.2), title='Clusters')
plt.show()
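DBSCAN labels outliers as -1, so it is easy to pull out the handful of movies flagged as noise (an added inspection step, not in the original code):
# Inspecting the rows DBSCAN marked as noise (cluster label -1)
noise = a[clusters.labels_ == -1]
print(noise)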
Conclusion
K-Means effectively organizes a dataset into meaningful groups, aiding tasks like classification and retrieval. Together with hierarchical clustering and DBSCAN, it simplifies data management and surfaces valuable insights, leading to more informed decisions and efficient analysis.
