import pandas as pd
# Load the movie dataset. The path is relative, so the CSV must sit in the
# current working directory.
movies = pd.read_csv("Movie_classification.csv")
# Discretizing the data: replace the text columns with numeric codes.
from sklearn import preprocessing

# Genre -> integer label per distinct genre.
le = preprocessing.LabelEncoder()
Genre_n = le.fit_transform(movies['Genre'])

# 3D_available -> 1 for "YES", 0 for "NO". Series.map leaves NaN for anything
# else, so fail loudly here instead of ending up with misaligned or missing
# codes further down.
d = movies['3D_available'].map({"YES": 1, "NO": 0})
if d.isna().any():
    raise ValueError("'3D_available' contains values other than YES/NO")
d = d.astype(int).tolist()

movies['3d_available'] = d
movies['Genre_n'] = Genre_n.tolist()

# Drop the original text columns (now encoded) together with Time_taken.
movies.drop(
    labels=['3D_available', 'Genre', 'Time_taken'],
    axis=1,
    inplace=True,
)
# Take out the label Collection.
# We are trying to predict Collection, so every other column is a feature.
X = movies.loc[:, movies.columns != 'Collection']
y = movies['Collection']
# Test Train Split
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. No random_state is passed, so the
# split (and every score computed from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Training Decision Tree
from sklearn import tree

# max_depth=3 keeps the tree shallow (at most three levels of splits).
regtree = tree.DecisionTreeRegressor(max_depth=3)
regtree.fit(X_train, y_train)
# Output: DecisionTreeRegressor(max_depth=3)
# Predicting values on both splits so train and test error can be compared.
y_train_pred = regtree.predict(X_train)
y_test_pred = regtree.predict(X_test)
import numpy as np

# Round the test predictions to whole numbers. astype() returns a new array
# rather than converting in place, so its result must be assigned for `a` to
# actually hold integers.
a = np.around(y_test_pred).astype(int)
print(a)
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on the held-out test set. Printed so the value is shown
# when the file runs as a script, not only as a notebook cell.
print(mean_squared_error(y_test, y_test_pred))
# Output: 55040682.73466998
import sklearn.metrics as metrics

# RMSE is the square root of the MSE, so it is in the same units as Collection.
print(
    'Root Mean Squared Error for test data:',
    np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)),
)
# Output: Root Mean Squared Error for test data: 7418.9408094868895
# Same metric on the training split, for comparison with the test RMSE.
print(
    'Root Mean Squared Error for train data:',
    np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)),
)
# Output: Root Mean Squared Error for train data: 7562.667994973122
# R^2 on the training split (about 0.83 in the recorded run). This only shows
# how well the tree fits data it has already seen; the test-set score is the
# one that indicates how well the model generalises.
print(r2_score(y_train, y_train_pred))
# Output: 0.8375532261107137
# R^2 on the held-out test split.
print(r2_score(y_test, y_test_pred))
# Output: 0.7974110224388984
# Confusion matrix of the true test values against the rounded predictions.
import seaborn as sns

# NOTE(review): confusion_matrix treats every distinct value as its own class.
# Collection looks like a numeric regression target, so this matrix may be
# large and mostly empty - confirm it is the intended diagnostic.
cm = confusion_matrix(y_test, a)
print(cm)

plt.figure(figsize=(14, 16))
sns.heatmap(cm, annot=True)
# Plotting the trees
from IPython.display import Image
import pydotplus

# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data = tree.export_graphviz(regtree, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
# Bare expression: the image is rendered only when run as a notebook cell.
Image(graph.create_png())
# Same depth-limited tree as before, drawn with column names and node colours.
regtree1 = tree.DecisionTreeRegressor(max_depth=3)
regtree1.fit(X_train, y_train)

# filled=True colours each node according to the value it predicts for the
# target variable (Collection).
dot_data = tree.export_graphviz(
    regtree1,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)
graph1 = pydotplus.graph_from_dot_data(dot_data)
Image(graph1.create_png())
# Control growth through min_samples_split instead of depth: a node is only
# split further if it holds at least 40 samples.
regtree2 = tree.DecisionTreeRegressor(min_samples_split=40)
regtree2.fit(X_train, y_train)

dot_data = tree.export_graphviz(
    regtree2,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())
# Control growth through min_samples_leaf: every leaf must keep at least
# 25 samples.
regtree3 = tree.DecisionTreeRegressor(min_samples_leaf=25)
regtree3.fit(X_train, y_train)

dot_data = tree.export_graphviz(
    regtree3,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)
graph3 = pydotplus.graph_from_dot_data(dot_data)
Image(graph3.create_png())
# Different Tree: a fresh 80/20 split with its own depth-3 regressor.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.20)

regtree4 = tree.DecisionTreeRegressor(max_depth=3)
# Fit on the training part of THIS split. Fitting on the earlier
# X_train/y_train would let rows that ended up in X_test1 leak into training
# and inflate the scores computed on the new split.
regtree4.fit(X_train1, y_train1)
# Output: DecisionTreeRegressor(max_depth=3)
# Predicting values for both parts of the second split.
y_train_pred1 = regtree4.predict(X_train1)
y_test_pred1 = regtree4.predict(X_test1)
# R^2 of the second tree on its training split. Printed so the value is shown
# when the file runs as a script, not only as a notebook cell.
print(r2_score(y_train1, y_train_pred1))
# OUTPUT : LINK
