IMPLEMENTATION ON SMALL DATASET

import os
import re
import string

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, silhouette_score
from sklearn import metrics, tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.cluster import KMeans

from wordcloud import WordCloud  # needed for the word clouds below
from IPython.display import Image
import pydotplus

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')  # needed for stopword removal below
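
The cells below assume the reviews are already in a dataframe df, with each review's text spread across one or more columns and its pos/neg label appearing as the last word. The loading step is not shown in this section; a minimal sketch, assuming the small dataset is a folder of plain-text files (the folder name and file pattern are hypothetical):

corpus_root = 'small_dataset'  # hypothetical folder of .txt review files
corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')
rows = [corpus.raw(fid) for fid in corpus.fileids()]
df = pd.DataFrame(rows)  # one raw review per row; the label is the last word of the text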

# Combine all text into a single column

df['Combined_text'] = df[df.columns[0:]].apply(lambda x: " ".join(x.dropna().astype(str)), axis=1)  # combine text columns
df['Label'] = df['Combined_text'].apply(lambda x: x.split()[-1])  # take the last word as the label
df['Combined_text'] = df['Combined_text'].apply(lambda x: ' '.join(x.split(' ')[:-1]))  # remove the last word
df.replace(to_replace=r"\t|\n|\r", value="", regex=True, inplace=True)
df['Label'] = df.apply(lambda row: re.sub("[^a-z0-9]", " ", row['Label']), axis=1)
df = df[['Combined_text', 'Label']]

df_neg_bc = df[df['Label'] == ' neg']
text = " ".join(cat for cat in df_neg_bc.Combined_text)

Generate word cloud

word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="salmon",
    colormap="Pastel1",
    collocations=False,
    max_words=50,
).generate(text)

Display Word Cloud

plt.imshow(word_cloud)
plt.axis("off")
plt.show()

df_pos_bc = df[df['Label'] == ' pos']
text = " ".join(cat for cat in df_pos_bc.Combined_text)

Generate word cloud

word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="grey",
    colormap="Pastel1",
    collocations=False,
    max_words=50,
).generate(text)

Display Word Cloud

plt.imshow(word_cloud)
plt.axis("off")
plt.show()

df.replace(to_replace=r"\t|\n|\r", value="", regex=True, inplace=True)
df['Combined_text'] = df.apply(lambda row: row['Combined_text'].lower(), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub("@[A-Za-z0-9_]+", "", row['Combined_text']), axis=1)  # strip @mentions
df['Combined_text'] = df.apply(lambda row: re.sub("#[A-Za-z0-9_]+", "", row['Combined_text']), axis=1)  # strip hashtags
df['Combined_text'] = df.apply(lambda row: re.sub(r"http\S+", "", row['Combined_text']), axis=1)  # strip URLs
df['Combined_text'] = df.apply(lambda row: re.sub(r"www\.\S+", "", row['Combined_text']), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub("[()!?]", " ", row['Combined_text']), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub("[.*?]", " ", row['Combined_text']), axis=1)  # strip literal . * ? characters
df['Combined_text'] = df.apply(lambda row: re.sub("[^a-z0-9]", " ", row['Combined_text']), axis=1)  # keep only lowercase letters and digits
df['Combined_text'] = df['Combined_text'].str.findall(r'\w{4,}').str.join(' ')  # keep only words of 4+ characters
df = df[['Combined_text', 'Label']]

More Cleaning

Removing Punctuation

def remove_punctuation(text):
    for element in text:
        if element in string.punctuation:
            text = text.replace(element, "")
    return text

df['Combined_text'] = df['Combined_text'].apply(remove_punctuation)

Removing Stopwords

stop = set(stopwords.words('english'))
df['Combined_text'] = df['Combined_text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))

# Word clouds – after cleaning

df_neg_ac = df[df['Label'] == ' neg']
text = " ".join(cat for cat in df_neg_ac.Combined_text)

Generate word cloud

word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="salmon",
    colormap="Pastel1",
    collocations=False,
    max_words=50,
).generate(text)

Display Word Cloud

plt.imshow(word_cloud)
plt.axis("off")
plt.show()

df_pos_ac = df[df['Label'] == ' pos']
text = " ".join(cat for cat in df_pos_ac.Combined_text)

Generate word cloud

word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="grey",
    colormap="Pastel1",
    collocations=False,
    max_words=50,
).generate(text)

Display Word Cloud

plt.imshow(word_cloud)
plt.axis("off")
plt.show()

Lemmatizer Function

def tokenize(text):
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
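
A quick sanity check of the lemmatizer (output shown as a comment; WordNet's default noun lemmatization maps plurals back to singulars):

print(tokenize("the movies were classics"))
# ['the', 'movie', 'were', 'classic']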

df['label'] = df['Label'].apply(lambda x: x.lstrip())  # strip the leading space from the label

df.rename(columns={'Combined_text': 'Review'}, inplace=True)
df.drop(columns=['Label'], inplace=True)

labels = df['label']

CVL = CountVectorizer(tokenizer=tokenize)
Mat1 = CVL.fit_transform(df['Review'])
df1 = pd.DataFrame(data=Mat1.toarray(), columns=CVL.get_feature_names_out())

CVF = CountVectorizer(max_features=30)
Mat2 = CVF.fit_transform(df['Review'])
df2 = pd.DataFrame(data=Mat2.toarray(), columns=CVF.get_feature_names_out())
df2
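
TfidfVectorizer is imported above but not exercised in this section; a minimal TF-IDF counterpart to the count matrices, should weighted features be preferred later (TFV, Mat3, and df_tfidf are illustrative names):

TFV = TfidfVectorizer(max_features=30)
Mat3 = TFV.fit_transform(df['Review'])
df_tfidf = pd.DataFrame(data=Mat3.toarray(), columns=TFV.get_feature_names_out())
df_tfidf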

Part 3: Latent Dirichlet Allocation

Dataframe 1

vocab = CVL.get_feature_names_out()

Create the LDA Model

lda_model = LatentDirichletAllocation(n_components=4, learning_method="online", max_iter=100, random_state=2)
X_topics = lda_model.fit_transform(df1)

Get topic words

topic_words = lda_model.components_

!pip install pyLDAvis
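
pyLDAvis is installed here but not invoked below; a minimal sketch of the interactive topic view it enables (the sklearn bridge lives in pyLDAvis.sklearn on releases before 3.4 and pyLDAvis.lda_model from 3.4 on):

import pyLDAvis
import pyLDAvis.lda_model  # use pyLDAvis.sklearn on older versions
pyLDAvis.enable_notebook()
pyLDAvis.lda_model.prepare(lda_model, Mat1, CVL)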

NUM_TOPICS = 4
word_topic = np.array(lda_model.components_)

print(word_topic)

word_topic = word_topic.transpose()

num_top_words = 15
vocab_array = np.asarray(vocab)

fontsize_base = 14  # fixed font size; 70 / np.max(word_topic) would instead scale text with each word's share

for t in range(NUM_TOPICS):
    plt.subplot(1, NUM_TOPICS, t + 1)  # plot numbering starts with 1
    plt.ylim(0, num_top_words + 0.5)  # stretch the y-axis to accommodate the words
    plt.xticks([])  # remove x-axis ticks
    plt.yticks([])  # remove y-axis ticks
    plt.title('Topic #{}'.format(t))
    top_words_idx = np.argsort(word_topic[:, t])[::-1]  # descending order
    top_words_idx = top_words_idx[:num_top_words]
    top_words = vocab_array[top_words_idx]
    top_words_shares = word_topic[top_words_idx, t]
    for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
        plt.text(0.1, num_top_words - i - 0.5, word, fontsize=11)  # or fontsize_base * share for share-scaled text

plt.tight_layout()
plt.show()

Dataframe 2

vocab = CVF.get_feature_names_out()

Create the LDA Model

lda_model = LatentDirichletAllocation(n_components=3, learning_method="online", max_iter=100, random_state=2)
X_topics = lda_model.fit_transform(df2)

Get topic words

topic_words = lda_model.components_

NUM_TOPICS = 3
word_topic = np.array(lda_model.components_)

print(word_topic)

word_topic = word_topic.transpose()

num_top_words = 15
vocab_array = np.asarray(vocab)

fontsize_base = 14  # fixed font size; 70 / np.max(word_topic) would instead scale text with each word's share

for t in range(NUM_TOPICS):
    plt.subplot(1, NUM_TOPICS, t + 1)  # plot numbering starts with 1
    plt.ylim(0, num_top_words + 0.5)  # stretch the y-axis to accommodate the words
    plt.xticks([])  # remove x-axis ticks
    plt.yticks([])  # remove y-axis ticks
    plt.title('Topic #{}'.format(t))
    top_words_idx = np.argsort(word_topic[:, t])[::-1]  # descending order
    top_words_idx = top_words_idx[:num_top_words]
    top_words = vocab_array[top_words_idx]
    top_words_shares = word_topic[top_words_idx, t]
    for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
        plt.text(0.1, num_top_words - i - 0.5, word, fontsize=11)  # or fontsize_base * share for share-scaled text

plt.tight_layout()
plt.show()

Clustering for n = 2

kmeans_2 = KMeans(n_clusters=2, random_state=2)
kmeans_2.fit(df2)
clusters_2 = kmeans_2.labels_
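
silhouette_score is imported at the top but never called; a quick check of how well separated the two clusters are (scores near 1 indicate tight, well-separated clusters; near 0, overlapping ones):

sil = silhouette_score(df2, clusters_2)
print("Silhouette score for n = 2:", np.round(sil, 3))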

Reducing Dimensionality for n = 2

pca = PCA(n_components=2, random_state=2)
pca_vecs = pca.fit_transform(df2.values)
a0 = pca_vecs[:, 0]
a1 = pca_vecs[:, 1]
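
Before reading much into the 2-D plot, it helps to check how much of the document-term variance the two components actually retain:

print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", np.round(pca.explained_variance_ratio_.sum(), 3))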

Plotting Clusters for n = 2

df3 = pd.DataFrame({'a0_2': a0, 'a1_2': a1, 'cluster_2': clusters_2})

plt.figure(figsize=(12, 7))
plt.title("KMeans n = 2 Clustering", fontdict={"fontsize": 18})
plt.xlabel("PCA Component 1", fontdict={"fontsize": 16})
plt.ylabel("PCA Component 2", fontdict={"fontsize": 16})
sns.scatterplot(data=df3, x='a0_2', y='a1_2', hue='cluster_2', palette="viridis")
plt.show()

Part 6: Naïve Bayes, Decision Trees, and SVMs


X = df['Review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

#Convert training text to a document-term matrix
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
train_dtm = vectorizer.transform(X_train)
#Transform testing data into a document-term matrix using the fitted vocabulary
test_dtm = vectorizer.transform(X_test)

#train dataframe
train_dtm_df = pd.DataFrame(train_dtm.toarray(), columns = vectorizer.get_feature_names_out())
train_dtm_df

#test Dataframe
test_dtm_df = pd.DataFrame(test_dtm.toarray(), columns = vectorizer.get_feature_names_out())
test_dtm_df

#Naive Bayes

nb = BernoulliNB()
nb.fit(train_dtm, y_train)
y_pred = nb.predict(test_dtm)
acc_nb = metrics.accuracy_score(y_test, y_pred)
print("Model Accuracy:", np.round(acc_nb*100, 2), "%")

#Confusion Matrix
cm1 = confusion_matrix(y_test, y_pred, labels=['pos', 'neg'])

# Build a dataframe from the confusion-matrix array so it is easy to plot
cm1 = pd.DataFrame(cm1,
                   index=['Positive', 'Negative'],
                   columns=['Positive', 'Negative'])

#Plotting the matrix
plt.figure(figsize=(7,5))
sns.heatmap(cm1, annot=True, fmt='g')
plt.title('Confusion Matrix for Naive Bayes')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()
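
The section header promises decision trees, but none is fit in the code shown; a minimal sketch on the same document-term matrices (the depth settings are illustrative choices, not tuned values):

dt = DecisionTreeClassifier(max_depth=5, random_state=3)
dt.fit(train_dtm, y_train)
y_pred_dt = dt.predict(test_dtm)
acc_dt = metrics.accuracy_score(y_test, y_pred_dt)
print("Model Accuracy:", np.round(acc_dt*100, 2), "%")

plt.figure(figsize=(12, 7))
tree.plot_tree(dt, max_depth=2, feature_names=vectorizer.get_feature_names_out(), class_names=['neg', 'pos'], filled=True)
plt.show()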

SVM

Linear Kernel

suppl = SVC(kernel="linear")
suppl.fit(train_dtm, y_train)
y2_predl = suppl.predict(test_dtm)
acc_svm = metrics.accuracy_score(y_test, y2_predl)
print("Model Accuracy:", np.round(acc_svm*100, 2), "%")

Confusion Matrix for Linear Kernel

cml = confusion_matrix(y_test, y2_predl, labels=['pos', 'neg'])
cml_df = pd.DataFrame(cml,
                      index=['Positive', 'Negative'],
                      columns=['Positive', 'Negative'])

plt.figure(figsize=(7, 5))
sns.heatmap(cml_df, annot=True, fmt='g')
plt.title("Confusion Matrix for Support Vector Machine – Linear Kernel")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

Sigmoid Kernel

supps = SVC(kernel="sigmoid")
supps.fit(train_dtm, y_train)
y2_preds = supps.predict(test_dtm)
acc_svm = metrics.accuracy_score(y_test, y2_preds)
print("Model Accuracy:", np.round(acc_svm*100, 2), "%")

Confusion Matrix for Sigmoid Kernel

cms = confusion_matrix(y_test, y2_preds, labels=['pos', 'neg'])
cms_df = pd.DataFrame(cms,
                      index=['Positive', 'Negative'],
                      columns=['Positive', 'Negative'])

plt.figure(figsize=(7, 5))
sns.heatmap(cms_df, annot=True, fmt='g')
plt.title("Confusion Matrix for Support Vector Machine – Sigmoid Kernel")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()
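
Accuracy alone can hide per-class behavior; a short summary of precision, recall, and F1 for, say, the sigmoid-kernel model (the same call works for any of the classifiers above):

print(metrics.classification_report(y_test, y2_preds))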

OUTPUT: LINK