DECISION TREES

import pandas as pd

# Load the movie dataset into a DataFrame.
# Plain ASCII quotes are required: typographic quotes are a syntax error.
movies = pd.read_csv("Movie_classification.csv")

# Encoding the categorical data as numbers

from sklearn import preprocessing

# Turn the text values of the Genre column into integer class labels.
le = preprocessing.LabelEncoder()
Genre_n = le.fit_transform(movies['Genre'])

# Encode the YES/NO flag in '3D_available' as 1/0 in a new '3d_available'
# column. Mapping the whole column keeps the result aligned with the frame;
# a manual loop that skips unknown values produces a list of the wrong length.
flag_codes = {"YES": 1, "NO": 0}
encoded_3d = movies['3D_available'].map(flag_codes)

# Fail loudly, with the offending values, instead of training on bad data.
unmapped = encoded_3d.isna()
if unmapped.any():
    bad_values = movies.loc[unmapped, '3D_available'].unique().tolist()
    raise ValueError(f"Unexpected values in '3D_available': {bad_values}")

movies['3d_available'] = encoded_3d.astype(int)

# Store the encoded genre labels as a new column of the frame.
movies['Genre_n'] = Genre_n.tolist()

# Remove the original text columns '3D_available' and 'Genre', and also
# 'Time_taken', from the frame in place (axis=1 selects columns).
movies.drop(labels=['3D_available', 'Genre', 'Time_taken'], axis=1, inplace=True)

# Take out the label Collection.
# We are trying to predict Collection, so it becomes the target y and every
# other column becomes a feature in X.
X = movies.loc[:, movies.columns != 'Collection']
y = movies['Collection']

# Test/train split
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. No random_state is set, so the split
# (and every score computed from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Training the decision tree
from sklearn import tree

# A regression tree limited to three levels of splits.
regtree = tree.DecisionTreeRegressor(max_depth=3)
regtree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3)

# Predicting values for both the training and the test rows.
y_train_pred = regtree.predict(X_train)
y_test_pred = regtree.predict(X_test)

import numpy as np

# Round the test predictions to whole numbers and keep them as integers.
# astype returns a new array, so its result must be assigned back.
a = np.around(y_test_pred).astype(int)
print(a)

from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error of the predictions on the held-out test rows.
mean_squared_error(y_test, y_test_pred)

55040682.73466998

import sklearn.metrics as metrics

# RMSE is the square root of the MSE, so it is in the same units as the target.
print('Root Mean Squared Error for test data:', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Root Mean Squared Error for test data: 7418.9408094868895

# Same metric on the training rows, for comparison with the test RMSE.
print('Root Mean Squared Error for train data:', np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))

Root Mean Squared Error for train data: 7562.667994973122

# R^2 on the training rows; the recorded run gave about 0.83, which the
# author reads as the model performing great.
r2_score(y_true=y_train, y_pred=y_train_pred)

0.8375532261107137

# R^2 on the held-out test rows.
r2_score(y_true=y_test, y_pred=y_test_pred)

0.7974110224388984

#Confusion matrix

import seaborn as sns

# NOTE(review): a confusion matrix is a classification metric. Here it is fed
# a regression target and rounded predictions, so every distinct value is
# treated as its own class — confirm this is really the intended diagnostic.
cm = confusion_matrix(y_test, a)
print(cm)

# Draw the matrix as an annotated heatmap on a large canvas.
plt.figure(figsize=(14, 16))
sns.heatmap(cm, annot=True)

#Plotting the trees

from IPython.display import Image
import pydotplus

# Export the fitted tree as Graphviz DOT text (out_file=None returns a string),
# then render that text to a PNG for inline display.
dot_data = tree.export_graphviz(regtree, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())


# Same depth-3 tree, drawn with readable feature names.
regtree1 = tree.DecisionTreeRegressor(max_depth=3)
regtree1.fit(X_train, y_train)

# filled=True colours each node according to its value of the target
# variable (Collection).
dot_data = tree.export_graphviz(regtree1, out_file=None, feature_names=X_train.columns, filled=True)
graph1 = pydotplus.graph_from_dot_data(dot_data)
Image(graph1.create_png())

# Control tree growth through min_samples_split=40 instead of a depth limit.
regtree2 = tree.DecisionTreeRegressor(min_samples_split=40)
regtree2.fit(X_train, y_train)

dot_data = tree.export_graphviz(regtree2, out_file=None, feature_names=X_train.columns, filled=True)
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())

# Control tree growth through min_samples_leaf=25 instead of a depth limit.
regtree3 = tree.DecisionTreeRegressor(min_samples_leaf=25)
regtree3.fit(X_train, y_train)

dot_data = tree.export_graphviz(regtree3, out_file=None, feature_names=X_train.columns, filled=True)
graph3 = pydotplus.graph_from_dot_data(dot_data)
Image(graph3.create_png())

# Different tree: a fresh 80/20 split with its own depth-3 regressor.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.20)
regtree4 = tree.DecisionTreeRegressor(max_depth=3)

# Fit on the NEW training split. Fitting on the earlier X_train/y_train would
# let rows of X_test1 leak into training and inflate the scores below.
regtree4.fit(X_train1, y_train1)

DecisionTreeRegressor(max_depth=3)

# Predicting values for the second split with the second tree.
y_train_pred1 = regtree4.predict(X_train1)
y_test_pred1 = regtree4.predict(X_test1)

# R^2 of the second tree on the training rows of the second split.
r2_score(y_true=y_train1, y_pred=y_train_pred1)

OUTPUT : LINK