EXPLORATORY DATA ANALYSIS

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline
matplotlib.rcParams[‘figure.figsize’] = (12,8)

# Load the movies dataset from the current working directory.
df = pd.read_csv("movies.csv")
# None removes the column cap so wide frames are displayed in full.
pd.set_option('display.max_columns', None)
print(df.columns.values)

Check for missing values

# Count the missing (NaN) entries in each column.
df.isna().sum()

Check the dimensionality

# Shape of df as a (rows, columns) tuple.
df.shape

Drop rows with missing data

# Columns that must be populated for a row to be kept.
cols = ['rating', 'released', 'score', 'votes', 'writer', 'star', 'country',
        'budget', 'gross', 'company', 'runtime']
# Drop every row that is missing a value in any of the listed columns, then
# renumber the rows. reset_index() returns a new frame rather than modifying
# df, so its result must be assigned; drop=True discards the old row labels
# instead of inserting them as an extra "index" column.
df = df.dropna(subset=cols).reset_index(drop=True)
df.head()

Check the dimensionality after dropping NA values

# Shape of df as a (rows, columns) tuple, for comparison with the earlier value.
df.shape

Check the data type of each column

# Data type of every column in df.
df.dtypes

Convert the budget, gross, and votes columns to integers

# Cast the budget, gross and votes columns to 64-bit integers.
# NOTE: astype('int64') raises if a column still contains NaN.
for col in ('budget', 'gross', 'votes'):
    df[col] = df[col].astype('int64')

Sort the data by gross and view the top 5 movies with the highest gross earnings

# Order the movies by gross earnings, highest first, and keep the first
# five rows (head() defaults to 5).
df_top5 = df.sort_values(by=['gross'], ascending=False).head()
df_top5

# Pairwise correlation of the numeric columns. The text columns are excluded
# explicitly: pandas >= 2.0 no longer drops them silently and raises instead.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, vmax=1)
# y=-0.2 positions the title below the axes rather than above them.
plt.title('Movie Correlations for Numeric Features', size=20, y=-0.2)
plt.xlabel('Features', size=15)
plt.ylabel('Features', size=15)
plt.show()

# Takeaway from the numeric-feature heatmap.
print("From our correlation heatmap we can see that budget and gross are most correlated, therefore, we can take a closer look by creating a scatter plot")

Scatter plot illustrating budget vs. gross

# Scatter of budget against gross with a fitted regression line;
# points are drawn in red and the line in blue.
sns.regplot(
    x='budget',
    y='gross',
    data=df,
    scatter_kws={'color': 'red'},
    line_kws={'color': 'blue'},
)
plt.title('Budget vs Gross Earnings', size=20)
plt.xlabel('Film Budget (millions)', size=15)
plt.ylabel('Gross Earnings (millions)', size=15)
plt.show()

# Takeaway from the budget-vs-gross scatter plot.
print("From our scatter plot fitted with a regression line we can see that unsuprisnlgy, budget and gross earnings are positively correlated.")

Convert our categorical features into numerical values

# Work on a copy so the original frame keeps its text columns.
df_n = df.copy()
# Replace every object-dtype column with the integer codes of its
# categorical encoding so it can take part in a correlation matrix.
for col in df_n.select_dtypes(include='object').columns:
    df_n[col] = df_n[col].astype('category').cat.codes
df_n.head()

Create a full correlation matrix with all the numeric features

# Correlation matrix over every column of the numerically encoded frame.
full_corr = df_n.corr()
sns.heatmap(full_corr, annot=True, vmin=-1)
plt.title('Movie Correlations', size=20)
plt.xlabel('Features', size=15)
plt.ylabel('Features', size=15)
plt.show()

# Flatten the correlation matrix into a Series indexed by pairs of feature
# names, then order the pairs from lowest to highest correlation.
corr_pairs = full_corr.unstack(level=-1)
sorted_corr = corr_pairs.sort_values(ascending=True)
sorted_corr

# Keep only the strongly positive correlations.
high_corr = sorted_corr[sorted_corr > 0.5]
# Exclude each feature's correlation with itself by comparing the two levels
# of the pair index. A fixed-size slice would silently include or omit pairs
# whenever the number of strong correlations changes.
first_feature = high_corr.index.get_level_values(0)
second_feature = high_corr.index.get_level_values(1)
not_1_high_corr = high_corr[first_feature != second_feature].sort_values(ascending=False)
print(not_1_high_corr)

# Takeaway from the full correlation matrix.
print("From our new correlation matrix we can see that despite the inclusion of non-numeric features (e.g. company, genre, etc.) budget and gross earning features are still the most correlated. However, we also see that votes and gross show a high correlation as well. This is understandable as movies with high gross earnings would most likely have much more voters than unpopular movies.")

# Grouped bars comparing budget and gross for the five highest-grossing films.
df_top5.plot(
    kind='bar',
    x='name',
    y=['budget', 'gross'],
    color={'budget': 'darkred', 'gross': 'blue'},
    figsize=(12, 10),
)
plt.title('Budget and Gross Earnings for Highest Grossing Films', size=25)
plt.xlabel('Movie Title', size=15)
plt.ylabel('Amount(millions)', size=15)
plt.show()

# Total budget and gross per company, ordered by gross, highest first.
# Several columns must be selected with a list: pandas >= 2.0 rejects the
# bare 'budget', 'gross' form, which is a tuple of keys.
companies = (
    df.groupby(['company'], as_index=False)[['budget', 'gross']]
    .sum()
    .sort_values(by='gross', ascending=False)
)
top5_companies = companies.head()
print(top5_companies)

# Horizontal grouped bars comparing total budget and gross for the five
# highest-grossing companies.
top5_companies.plot(
    kind='barh',
    x='company',
    y=['budget', 'gross'],
    color={'budget': 'darkred', 'gross': 'blue'},
    figsize=(12, 8),
)
plt.title('Budget and Gross Earnings for Highest Grossing Companies', size=25)
plt.xlabel('Amount (100 millions)', size=15)
plt.ylabel('Company', size=15)
plt.show()

# Total gross per genre, highest first. reset_index() returns a new frame,
# so it is chained and assigned here; drop=True renumbers the rows in sorted
# order without adding the old labels as a column.
genre_gross = (
    df.groupby(['genre'], as_index=False)['gross']
    .sum()
    .sort_values(by='gross', ascending=False)
    .reset_index(drop=True)
)
genre_gross

# Bar chart of total gross per genre. Only gross is plotted and the x axis is
# the genre column, so the title and x label say so.
genre_gross.plot(kind='bar', x='genre', y='gross', color='dodgerblue', figsize=(12, 8))
plt.title('Gross Earnings by Genre', size=25, y=1.05)
plt.xlabel('Genre', size=15)
plt.ylabel('Amount(100 millions)', size=15)
plt.show()

# Number of movies per writer (count of non-null names), most prolific first.
writers = (
    df.groupby(['writer'], as_index=False)['name']
    .count()
    .sort_values(by='name', ascending=False)
)
# After count() the 'name' column holds a tally, so rename it accordingly.
writers.rename(columns={'name': 'movies'}, inplace=True)
top_writers = writers.head(20)
top_writers

# Bar chart of movie counts for the 20 most prolific writers.
top_writers.plot(kind='bar', x='writer', y='movies', color='dodgerblue', figsize=(12, 8))
plt.title('Writer vs Number of Movies', size=25, y=1.05)
plt.xlabel('Writer', size=15)
# Tilt the tick labels by 35 degrees.
plt.xticks(rotation=35)
plt.ylabel('Number of Movies', size=15)
plt.show()

# Number of movies per lead star (count of non-null names), most frequent first.
stars = (
    df.groupby(['star'], as_index=False)['name']
    .count()
    .sort_values(by='name', ascending=False)
)
# After count() the 'name' column holds a tally, so rename it accordingly.
stars.rename(columns={'name': 'movies'}, inplace=True)
top_stars = stars.head(30)
print(top_stars)

# Bar chart of movie counts for the 30 most frequent stars.
top_stars.plot(kind='bar', x='star', y='movies', color='red', figsize=(12, 8))
plt.title('Star vs Number of Movies', size=25)
plt.xlabel('Star', size=15)
plt.ylabel('Number of Movies', size=15)
plt.show()

# Movies ordered by score, highest first, restricted to scores above 8.6.
df_scoring = df.sort_values(by='score', ascending=False)
df_scoring = df_scoring[df_scoring['score'] > 8.6]
print(df_scoring)

import plotly.express as px

# Interactive bar chart of gross earnings for the highest-scoring movies,
# with each bar labelled by its score.
fig = px.bar(
    df_scoring,
    x='name',
    y='gross',
    title='Gross Earnings for Highest Rated Movies',
    text='score',
    width=1100,
    height=800,
)
fig.update_layout(xaxis_title='Movie Title', yaxis_title='Gross Earning', title_x=0.45)
# NOTE(review): hovertext='score' is passed as a literal string here, not as
# a column lookup -- confirm this is the intended hover content.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', hovertext='score')
# Hide bar labels that would have to shrink below size 8 to fit.
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_traces(marker_color='orangered', marker_line_color='black')
fig.show()

OUTPUT: LINK