For this data analysis, we have selected a database that contains information about movies. This database lists the movies with the time and date they were released, along with the name of the director, the production company and the cast details. It also has the budget and revenue details for most of the movies. Our approach starts with data wrangling and cleaning, where we load the data and clean the database to make it more appropriate for analysis. Then we perform data analysis on the cleaned data and communicate the results through visualization. Below are the different factors considered for this analysis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the movie data from the CSV file and preview the first two rows
df = pd.read_csv('tmdb-movies.csv')
df.head(2)
# Drop the rows which have no genre listed. Missing values are NaN after
# read_csv, so dropna removes them directly instead of converting them to the
# string "nan" and filtering on that sentinel afterwards.
df.dropna(subset=['genres'], inplace=True)
# Genres are '|'-separated; consider only the first one as the genre of the
# film. astype(str) keeps the .str accessor usable whatever dtype was inferred.
df['genres'] = df['genres'].astype(str).str.split('|').str[0]
#confirm changes
df.info()
# Drop the columns that are not used by the analysis below
df.drop(['tagline','imdb_id','homepage','keywords','overview','director','production_companies'],axis=1,inplace=True)
#confirm changes
df.info()
#1 Number of Movies released on each genre for the past 5 years
# The filter keeps movies whose release_year is 2011 or later.
df_5 = df[df['release_year'] > 2010]
# One count per genre; the resulting index holds the genre names.
gen_ct = df_5.groupby(['genres'])['id'].count()
#Visualization
# Build the bars straight from the grouped counts so that every genre present
# in the filtered data gets a bar. Hand-written per-genre lookups drift out of
# sync with the bar list (Animation was looked up but never plotted) and raise
# KeyError for any genre that has no movie in the period.
labels = list(gen_ct.index)
gens = list(gen_ct.values)
N = len(gens)
locations = np.arange(N)
plt.bar(locations, gens, tick_label=labels)
plt.xticks(locations, labels, rotation=90)
plt.title('No.of Movies by genres')
plt.xlabel('genres')
plt.ylabel('No.of Movies')
#COMMUNICATION - In the last 5 years, the Drama genre tops the list with the highest number of movies, followed by the Comedy genre.
# NOTE(review): Animation is now included in the chart - re-check this reading against the plot.
# Show summary statistics of the cleaned DataFrame
df.describe()
#2 - Genre Vs Revenue_adj
# Mean adjusted revenue per genre over the whole cleaned dataset.
df_r = df.groupby(['genres'])['revenue_adj'].mean()
#Visualization
# Plot directly from the grouped means so every genre is charted (Animation
# used to be looked up but never plotted) and a missing genre cannot raise
# KeyError. 'Foreign' is left out of this chart, as before; errors='ignore'
# keeps the drop harmless if that genre is not in the data.
# (Reason for excluding 'Foreign' is not recorded - TODO confirm.)
rev_by_genre = df_r.drop('Foreign', errors='ignore').round()
labels = list(rev_by_genre.index)
gens = list(rev_by_genre.values)
N = len(gens)
locations = np.arange(N)
plt.bar(locations, gens, tick_label=labels)
plt.xticks(locations, labels, rotation=90)
plt.title('Genre Vs Revenue_adj')
plt.xlabel('Genres')
plt.ylabel('Revenue_adj')
# COMMUNICATION - Adventure movies produced more revenue when compared to all other genres. Science Fiction movies
# also produced a good amount of revenue.
# NOTE(review): Animation is now included in the chart - re-check this reading against the plot.
#3 - Genre Vs Budget_adj
# Mean adjusted budget per genre over the whole cleaned dataset.
df_b = df.groupby(['genres'])['budget_adj'].mean()
#Visualization
# Plot directly from the grouped means so every genre is charted (Animation
# used to be looked up but never plotted) and a missing genre cannot raise
# KeyError. 'Foreign' is left out of this chart, as before; errors='ignore'
# keeps the drop harmless if that genre is not in the data.
# (Reason for excluding 'Foreign' is not recorded - TODO confirm.)
budget_by_genre = df_b.drop('Foreign', errors='ignore').round()
labels = list(budget_by_genre.index)
gens = list(budget_by_genre.values)
N = len(gens)
locations = np.arange(N)
plt.bar(locations, gens, tick_label=labels)
plt.xticks(locations, labels, rotation=90)
plt.title('Genre Vs Budget_adj')
plt.xlabel('Genres')
plt.ylabel('Budget_adj')
# COMMUNICATION - Adventure genre movies are made with a bigger budget than any other genre
# and they also produce more revenue. Surprisingly, Action movies are made with a large budget but produce less revenue.
# NOTE(review): Animation is now included in the chart - re-check this reading against the plot.
#4 - Genre Vs Popularity
# Mean popularity per genre over the whole cleaned dataset.
df_p = df.groupby(['genres'])['popularity'].mean()
#Visualization
# Plot directly from the grouped means so every genre is charted (Animation
# used to be looked up but never plotted) and a missing genre cannot raise
# KeyError. 'Foreign' is left out of this chart, as before; errors='ignore'
# keeps the drop harmless if that genre is not in the data.
# (Reason for excluding 'Foreign' is not recorded - TODO confirm.)
popularity_by_genre = df_p.drop('Foreign', errors='ignore')
labels = list(popularity_by_genre.index)
gens = list(popularity_by_genre.values)
N = len(gens)
locations = np.arange(N)
plt.bar(locations, gens, tick_label=labels)
plt.xticks(locations, labels, rotation=90)
plt.title('Genre Vs Popularity')
plt.xlabel('Genres')
plt.ylabel('Popularity')
#Communication - Adventure and Science Fiction are more popular among the people and Documentary gets the least popularity.
# NOTE(review): Animation is now included in the chart - re-check this reading against the plot.
#5 Genre vs Vote_Avg
# Mean vote_average per genre over the whole cleaned dataset.
df_v = df.groupby(['genres'])['vote_average'].mean()
#Visualization
# Plot directly from the grouped means so every genre is charted (Animation
# used to be looked up but never plotted) and a missing genre cannot raise
# KeyError. 'Foreign' is left out of this chart, as before; errors='ignore'
# keeps the drop harmless if that genre is not in the data.
# (Reason for excluding 'Foreign' is not recorded - TODO confirm.)
votes_by_genre = df_v.drop('Foreign', errors='ignore')
labels = list(votes_by_genre.index)
gens = list(votes_by_genre.values)
N = len(gens)
locations = np.arange(N)
plt.bar(locations, gens, tick_label=labels)
plt.xticks(locations, labels, rotation=90)
plt.title('Genre Vs vote_average')
plt.xlabel('Genres')
plt.ylabel('vote_average')
#Communication: Documentary tops the list with a higher voting average than any other genre.
# NOTE(review): Animation is now included in the chart - re-check this reading against the plot.
#6 Budget_adj vs Revenue_adj
# Per-genre means of both adjusted columns, taken from a single grouped pass.
genre_means = df.groupby('genres')[['revenue_adj', 'budget_adj']].mean()
df_r = genre_means['revenue_adj']
df_b = genre_means['budget_adj']
# One point per genre: mean revenue on the x axis, mean budget on the y axis.
plt.scatter(df_r, df_b)
plt.xlabel('Revenue_adj')
plt.ylabel('Budget_adj')
plt.title('Budget_adj Vs Revenue_adj')
#Communication: At the genre-mean level, budget and revenue show a good positive correlation.
#Conclusions:
# We were able to analyze the movie dataset by considering genre as the main factor.
# We were able to identify which genres people possibly like or dislike the most.
# We were able to observe the relation between the budget and revenue factors.
# We can further investigate this dataset by taking other factors like production companies, cast list, runtime etc.
# Run nbconvert on the notebook file through a child Python process.
from subprocess import call

nbconvert_cmd = ['python', '-m', 'nbconvert', 'Investigate_a_Dataset.ipynb']
call(nbconvert_cmd)