import os
import re
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, silhouette_score
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.cluster import KMeans
from wordcloud import WordCloud  # used for the word clouds below
from IPython.display import Image
import pydotplus
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')  # needed for stopwords.words('english') below
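Note: `df` is used below without being constructed in this excerpt. A minimal sketch of one way it could have been loaded, given that `os` and `PlaintextCorpusReader` are imported above (the directory name and file pattern are assumptions, not from the source):
# Hypothetical loading step: read raw review files into a one-column DataFrame.
# 'reviews/' and the .txt pattern are placeholders for the actual corpus location.
corpus = PlaintextCorpusReader('reviews/', r'.*\.txt')
df = pd.DataFrame({'text': [corpus.raw(fileid) for fileid in corpus.fileids()]})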
# Combine all text into a single column
df['Combined_text'] = df[df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)  # combines text
df['Label'] = df['Combined_text'].apply(lambda x: x.split()[-1])  # take the last word as the label
df['Combined_text'] = df['Combined_text'].apply(lambda x: ' '.join(x.split(' ')[:-1]))  # remove the last word
df.replace(to_replace=r"\t|\n|\r", value="", regex=True, inplace=True)
df['Label'] = df.apply(lambda row: re.sub("[^a-z0-9]", " ", row['Label']), axis=1)
df = df[['Combined_text', 'Label']]
# Word clouds – before cleaning
df_neg_bc = df[df['Label'] == ' neg']
text = " ".join(cat for cat in df_neg_bc.Combined_text)
# Generate the word cloud
word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="salmon",
    colormap="Pastel1",
    collocations=False,
    max_words=50
).generate(text)
# Display the word cloud
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
df_pos_bc = df[df['Label'] == ' pos']
text = " ".join(cat for cat in df_pos_bc.Combined_text)
# Generate the word cloud
word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="grey",
    colormap="Pastel1",
    collocations=False,
    max_words=50
).generate(text)
# Display the word cloud
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
df.replace(to_replace=r"\t|\n|\r", value="", regex=True, inplace=True)
df['Combined_text'] = df.apply(lambda row: row['Combined_text'].lower(), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub("@[A-Za-z0-9_]+", "", row['Combined_text']), axis=1)  # strip @mentions
df['Combined_text'] = df.apply(lambda row: re.sub("#[A-Za-z0-9_]+", "", row['Combined_text']), axis=1)  # strip #hashtags
df['Combined_text'] = df.apply(lambda row: re.sub(r"http\S+", "", row['Combined_text']), axis=1)  # strip URLs
df['Combined_text'] = df.apply(lambda row: re.sub(r"www\.\S+", "", row['Combined_text']), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub('[()!?]', " ", row['Combined_text']), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub(r'[.*?]', " ", row['Combined_text']), axis=1)
df['Combined_text'] = df.apply(lambda row: re.sub("[^a-z0-9]", " ", row['Combined_text']), axis=1)  # keep only lowercase letters and digits
df['Combined_text'] = df['Combined_text'].str.findall(r'\w{4,}').str.join(' ')  # keep only words of 4+ characters
df = df[['Combined_text', 'Label']]
More Cleaning
Removing Punctuation
def remove_punctuation(text):
    for element in text:
        if element in string.punctuation:
            text = text.replace(element, "")
    return text
df['Combined_text'] = df['Combined_text'].apply(remove_punctuation)
Removing Stopwords
stop = stopwords.words('english')
df['Combined_text'] = df['Combined_text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))
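A quick illustration of what the stopword filter does, using a hypothetical toy sentence rather than a row from the dataset:
# Hypothetical example: common function words drop out, content words remain.
sample = "the movie was great and the acting was brilliant"
print(" ".join(word for word in sample.split() if word not in stop))
# -> movie great acting brilliant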
# Word clouds – after cleaning
df_neg_ac = df[df['Label'] == ' neg']
text = " ".join(cat for cat in df_neg_ac.Combined_text)
# Generate the word cloud
word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="salmon",
    colormap="Pastel1",
    collocations=False,
    max_words=50
).generate(text)
# Display the word cloud
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
df_pos_ac = df[df['Label'] == ' pos']
text = " ".join(cat for cat in df_pos_ac.Combined_text)
# Generate the word cloud
word_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="grey",
    colormap="Pastel1",
    collocations=False,
    max_words=50
).generate(text)
# Display the word cloud
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
Lemmatizer Function
def tokenize(text):
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
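A quick sanity check of the tokenizer on a hypothetical input (WordNetLemmatizer defaults to treating tokens as nouns, so plural nouns are singularized while verbs pass through unchanged):
# Hypothetical example, not from the dataset.
print(tokenize("these movies were classics"))
# -> ['these', 'movie', 'were', 'classic']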
df['label'] = df['Label'].apply(lambda x: x.lstrip())
df.rename(columns={'Combined_text': 'Review'}, inplace=True)
df.drop(columns=['Label'], axis=1, inplace=True)
labels = df['label']
CVL = CountVectorizer(tokenizer=tokenize)
Mat1 = CVL.fit_transform(df['Review'])
df1 = pd.DataFrame(data=Mat1.toarray(), columns=CVL.get_feature_names_out())
CVF = CountVectorizer(max_features=30)
Mat2 = CVF.fit_transform(df['Review'])
df2 = pd.DataFrame(data=Mat2.toarray(), columns=CVF.get_feature_names_out())
df2
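TfidfVectorizer is imported above but never used in this excerpt; a sketch of the parallel TF-IDF matrix, in case weighting by inverse document frequency is preferred over raw counts:
# Sketch only: TF-IDF weighting over the same 30-term vocabulary size.
TFV = TfidfVectorizer(max_features=30)
Mat3 = TFV.fit_transform(df['Review'])
df_tfidf = pd.DataFrame(data=Mat3.toarray(), columns=TFV.get_feature_names_out())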
Part 3: Latent Dirichlet Allocation
Dataframe 1
vocab = CVL.get_feature_names_out()
# Create the LDA model
lda_model = LatentDirichletAllocation(n_components=4, learning_method="online", max_iter=100, random_state=2)
X_topics = lda_model.fit_transform(df1)
# Get topic words
topic_words = lda_model.components_
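Before the plots below, a quick textual check of each topic can be useful; a small sketch using the fitted model and vocabulary from above:
# Sketch: print the ten highest-weight words per topic as a sanity check.
for t, component in enumerate(topic_words):
    top_idx = np.argsort(component)[::-1][:10]
    print('Topic #{}:'.format(t), ' '.join(vocab[i] for i in top_idx))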
!pip install pyLDAvis
NUM_TOPICS = 4
word_topic = np.array(lda_model.components_)
print(word_topic)
word_topic = word_topic.transpose()
num_top_words = 15
vocab_array = np.asarray(vocab)
fontsize_base = 70 / np.max(word_topic)  # font size for the word with the largest share in the corpus
fontsize_base = 14  # overrides the computed size with a fixed one
for t in range(NUM_TOPICS):
    plt.subplot(1, NUM_TOPICS, t + 1)  # plot numbering starts with 1
    plt.ylim(0, num_top_words + 0.5)   # stretch the y-axis to accommodate the words
    plt.xticks([])  # remove x-axis ticks
    plt.yticks([])  # remove y-axis ticks
    plt.title('Topic #{}'.format(t))
    top_words_idx = np.argsort(word_topic[:, t])[::-1]  # descending order
    top_words_idx = top_words_idx[:num_top_words]
    top_words = vocab_array[top_words_idx]
    top_words_shares = word_topic[top_words_idx, t]
    for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
        plt.text(0.1, num_top_words - i - 0.5, word, fontsize=11)  # or fontsize_base * share
plt.tight_layout()
plt.show()
Dataframe 2
vocab = CVF.get_feature_names_out()
# Create the LDA model
lda_model = LatentDirichletAllocation(n_components=3, learning_method="online", max_iter=100, random_state=2)
X_topics = lda_model.fit_transform(df2)
# Get topic words
topic_words = lda_model.components_
NUM_TOPICS = 3
word_topic = np.array(lda_model.components_)
print(word_topic)
word_topic = word_topic.transpose()
num_top_words = 15
vocab_array = np.asarray(vocab)
fontsize_base = 70 / np.max(word_topic)  # font size for the word with the largest share in the corpus
fontsize_base = 14  # overrides the computed size with a fixed one
for t in range(NUM_TOPICS):
    plt.subplot(1, NUM_TOPICS, t + 1)  # plot numbering starts with 1
    plt.ylim(0, num_top_words + 0.5)   # stretch the y-axis to accommodate the words
    plt.xticks([])  # remove x-axis ticks
    plt.yticks([])  # remove y-axis ticks
    plt.title('Topic #{}'.format(t))
    top_words_idx = np.argsort(word_topic[:, t])[::-1]  # descending order
    top_words_idx = top_words_idx[:num_top_words]
    top_words = vocab_array[top_words_idx]
    top_words_shares = word_topic[top_words_idx, t]
    for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
        plt.text(0.1, num_top_words - i - 0.5, word, fontsize=11)  # or fontsize_base * share
plt.tight_layout()
plt.show()
Clustering for n = 2
kmeans_2 = KMeans(n_clusters=2)
kmeans_2.fit(df2)
clusters_2 = kmeans_2.labels_
Reducing Dimensionality for n = 2
pca = PCA(n_components=2, random_state=2)
pca_vecs = pca.fit_transform(df2.values.tolist())
a0 = pca_vecs[:, 0]
a1 = pca_vecs[:, 1]
df3 = pd.DataFrame({'a0_2': a0, 'a1_2': a1, 'cluster_2': clusters_2})
Plotting Clusters for n = 2
plt.figure(figsize=(12, 7))
plt.title("KMeans n = 2 Clustering", fontdict={"fontsize": 18})
plt.xlabel("PCA Component 1", fontdict={"fontsize": 16})
plt.ylabel("PCA Component 2", fontdict={"fontsize": 16})
sns.scatterplot(data=df3, x='a0_2', y='a1_2', hue='cluster_2', palette="viridis")
plt.show()
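silhouette_score is imported above but never called in this excerpt; a short sketch of how it could guide the choice of cluster count on the same df2 matrix:
# Sketch: compare silhouette scores for a few values of k (higher is better).
for k in range(2, 6):
    km = KMeans(n_clusters=k, random_state=2).fit(df2)
    print(k, silhouette_score(df2, km.labels_))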
Part 6: Naïve Bayes, Decision Trees, and SVMs
X = df['Review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
# Convert training text to a document-term matrix
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
train_dtm = vectorizer.transform(X_train)
#Transform testing data into document term matrix using fitted vocabulary
test_dtm = vectorizer.transform(X_test)
#train dataframe
train_dtm_df = pd.DataFrame(train_dtm.toarray(), columns = vectorizer.get_feature_names_out())
train_dtm_df
#test Dataframe
test_dtm_df = pd.DataFrame(test_dtm.toarray(), columns = vectorizer.get_feature_names_out())
test_dtm_df
# Naive Bayes
nb = BernoulliNB()
nb.fit(train_dtm, y_train)
y_pred = nb.predict(test_dtm)
# Confusion Matrix
cm1 = confusion_matrix(y_test, y_pred, labels=['pos', 'neg'])
# Put the confusion matrix into a DataFrame so it is easy to plot.
cm1 = pd.DataFrame(cm1,
                   index=['Positive', 'Negative'],
                   columns=['Positive', 'Negative'])
#Plotting the matrix
plt.figure(figsize=(7,5))
sns.heatmap(cm1, annot=True, fmt='g')
plt.title('Confusion Matrix for Naive Bayes')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()
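The SVM sections below report accuracy; the same check for the Naive Bayes model, using the metrics module imported above:
# Accuracy for the Naive Bayes model, mirroring the SVM reports below.
acc_nb = metrics.accuracy_score(y_test, y_pred)
print("Model Accuracy:", np.round(acc_nb * 100, 2), "%")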
SVM
Linear Kernel
suppl = SVC(kernel="linear")
suppl.fit(train_dtm, y_train)
y2_predl = suppl.predict(test_dtm)
acc_svm = metrics.accuracy_score(y_test, y2_predl)
print("Model Accuracy:", np.round(acc_svm * 100, 2), "%")
Confusion Matrix for Linear Kernel
cml = confusion_matrix(y_test, y2_predl, labels=['pos', 'neg'])
cml_df = pd.DataFrame(cml,
                      index=['Positive', 'Negative'],
                      columns=['Positive', 'Negative'])
plt.figure(figsize=(7, 5))
sns.heatmap(cml_df, annot=True, fmt='g')
plt.title("Confusion Matrix for Support Vector Machine - Linear Kernel")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()
Sigmoid Kernel
supps = SVC(kernel="sigmoid")
supps.fit(train_dtm, y_train)
y2_preds = supps.predict(test_dtm)
acc_svm = metrics.accuracy_score(y_test, y2_preds)
print("Model Accuracy:", np.round(acc_svm * 100, 2), "%")
Confusion Matrix for Sigmoid Kernel
cms = confusion_matrix(y_test, y2_preds, labels=['pos', 'neg'])
cms_df = pd.DataFrame(cms,
                      index=['Positive', 'Negative'],
                      columns=['Positive', 'Negative'])
plt.figure(figsize=(7, 5))
sns.heatmap(cms_df, annot=True, fmt='g')
plt.title("Confusion Matrix for Support Vector Machine - Sigmoid Kernel")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()
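The Part 6 heading promises decision trees and DecisionTreeClassifier is imported at the top, but no tree code survives in this excerpt. A minimal sketch of how one would slot into the same pipeline (the max_depth value is an arbitrary assumption):
# Sketch only: decision tree on the same document-term matrices.
dt = DecisionTreeClassifier(max_depth=10, random_state=3)  # max_depth is a guess
dt.fit(train_dtm, y_train)
y_pred_dt = dt.predict(test_dtm)
print("Model Accuracy:", np.round(metrics.accuracy_score(y_test, y_pred_dt) * 100, 2), "%")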
OUTPUT: LINK
