# Setting the working directory and importing the datasets
import numpy as np
import pandas as pd
# Set the working directory to the folder containing the datasets
import os
os.chdir(r"C:\Users\Waneda\Documents\Dalarna Masters_Wonder\Business Intelligence\Lab4 Text Mining\star_wars_movies_scripts")
# Load the dataset from the CSV file
episode4 = pd.read_csv('episode4.csv')
episode5 = pd.read_csv('episode5.csv')
episode6 = pd.read_csv('episode6.csv')
episode4.head()
| | Character | Dialogue |
|---|---|---|
| 0 | THREEPIO | Did you hear that? They've shut down the main... |
| 1 | THREEPIO | We're doomed! |
| 2 | THREEPIO | There'll be no escape for the Princess this time. |
| 3 | THREEPIO | What's that? |
| 4 | THREEPIO | I should have known better than to trust the l... |
# Finding the top 10 characters with the most dialogue in each dataset
episode4_top10 = episode4.groupby('Character').size().reset_index(name='Dialogue Count').sort_values('Dialogue Count', ascending=False).head(10)
episode5_top10 = episode5.groupby('Character').size().reset_index(name='Dialogue Count').sort_values('Dialogue Count', ascending=False).head(10)
episode6_top10 = episode6.groupby('Character').size().reset_index(name='Dialogue Count').sort_values('Dialogue Count', ascending=False).head(10)
# Printing the top 10 characters in a table
print("Top 10 Characters with the Most Dialogue in Episode 4:\n", episode4_top10)
print("\nTop 10 Characters with the Most Dialogue in Episode 5:\n", episode5_top10)
print("\nTop 10 Characters with the Most Dialogue in Episode 6:\n", episode6_top10)
Top 10 Characters with the Most Dialogue in Episode 4:
     Character  Dialogue Count
31        LUKE             253
25         HAN             153
50    THREEPIO             118
4          BEN              82
30        LEIA              57
53       VADER              41
41  RED LEADER              37
6        BIGGS              34
48      TARKIN              28
37        OWEN              25

Top 10 Characters with the Most Dialogue in Episode 5:
   Character  Dialogue Count
13       HAN             181
23      LUKE             128
21      LEIA             114
40  THREEPIO              92
20     LANDO              61
43     VADER              56
47      YODA              36
29     PIETT              23
7   CREATURE              21
2        BEN              15

Top 10 Characters with the Most Dialogue in Episode 6:
   Character  Dialogue Count
15       HAN             124
22      LUKE             112
46  THREEPIO              89
21      LEIA              55
47     VADER              43
10   EMPEROR              39
20     LANDO              39
18     JABBA              20
2        BEN              18
0     ACKBAR              14
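As an aside, pandas' value_counts gives the same top-10 table in a single chain; a minimal equivalent sketch for Episode 4, assuming the column names used above:

# Equivalent counting with value_counts; same result without the explicit groupby
episode4_top10_alt = (
    episode4['Character']
    .value_counts()                      # dialogue lines per character, sorted descending
    .head(10)                            # keep the ten most talkative characters
    .rename_axis('Character')
    .reset_index(name='Dialogue Count')  # back to a two-column dataframe
)
print(episode4_top10_alt)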
import matplotlib.pyplot as plt
# Plotting the top 10 characters for each episode using bar charts side-by-side
fig, axs = plt.subplots(1, 3, figsize=(15,5))
axs[0].bar(episode4_top10['Character'], episode4_top10['Dialogue Count'], color='red')
axs[0].set_title('Episode 4')
axs[0].tick_params(axis='x', labelrotation=90)  # rotate the labels in place (avoids the FixedFormatter warning raised by set_xticklabels)
axs[1].bar(episode5_top10['Character'], episode5_top10['Dialogue Count'], color='green')
axs[1].set_title('Episode 5')
axs[1].tick_params(axis='x', labelrotation=90)
axs[2].bar(episode6_top10['Character'], episode6_top10['Dialogue Count'], color='blue')
axs[2].set_title('Episode 6')
axs[2].tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()
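The three panels use independent y-axes; if cross-episode comparison matters, sharey=True puts all bars on a common scale. A drop-in variation of the subplot call above, everything else unchanged:

# Shared y-axis so bar heights are directly comparable across panels
fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)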
# Add the "episode" column to each dataframe
episode4['episode'] = 'Episode IV'
episode5['episode'] = 'Episode V'
episode6['episode'] = 'Episode VI'
# Concatenate the three dataframes into one
star_wars = pd.concat([episode4, episode5, episode6], ignore_index=True)
star_wars.head()
| | Character | Dialogue | episode |
|---|---|---|---|
| 0 | THREEPIO | Did you hear that? They've shut down the main... | Episode IV |
| 1 | THREEPIO | We're doomed! | Episode IV |
| 2 | THREEPIO | There'll be no escape for the Princess this time. | Episode IV |
| 3 | THREEPIO | What's that? | Episode IV |
| 4 | THREEPIO | I should have known better than to trust the l... | Episode IV |
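A quick sanity check that the concatenation kept every line and tagged it with the right film; a minimal sketch using the frames already in memory:

# Dialogue lines per episode; the three counts should sum to the total
print(star_wars['episode'].value_counts())
print('Total lines:', len(star_wars))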
from collections import Counter
# Combine all dialogues into a single string
all_dialogues = ' '.join(star_wars['Dialogue'])
# Tokenize the text into words
words = all_dialogues.split()
# Calculate the frequency distribution of words
word_freq = Counter(words)
# Get the most common words and their frequencies
most_common = word_freq.most_common(20)
# Plot the frequency distribution
word_list, freq_list = zip(*most_common)
plt.figure(figsize=(12, 6))
plt.bar(word_list, freq_list)
plt.title('Top 20 Most Common Words in Star Wars Trilogy')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
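These raw counts are dominated by stop words and case/punctuation variants ('the', 'you', 'I'), which is what motivates the preprocessing step below. Even simple normalization changes the ranking; a minimal sketch reusing the all_dialogues string from above:

import string
# Lowercase and strip punctuation so that 'You' and 'you,' collapse into one token
normalized = all_dialogues.lower().translate(str.maketrans('', '', string.punctuation))
print(Counter(normalized.split()).most_common(10))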
# Import the relevant packages
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Create a function that performs the preprocessing steps (tokenize, remove stop words, lemmatize) and returns the cleaned text.
stop_words = set(stopwords.words('english'))  # build the stop-word set once, not once per token
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())
    # Keep alphabetic tokens that are not stop words
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    # Lemmatize the tokens (noun lemmatization by default)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)
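A quick check of the function on (a shortened version of) the first line of Episode IV: stop words such as 'did', 'you', and 'that' are dropped, and the punctuation and contraction fragments fail the isalpha test:

print(preprocess_text("Did you hear that? They've shut down the main reactor."))
# hear shut main reactor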
# Apply the function to the dialogue column of the combined dataframe
star_wars["new_script"] = star_wars["Dialogue"].apply(preprocess_text)
star_wars.head()
| | Character | Dialogue | episode | new_script |
|---|---|---|---|---|
| 0 | THREEPIO | Did you hear that? They've shut down the main... | Episode IV | hear shut main reactor destroyed sure madness |
| 1 | THREEPIO | We're doomed! | Episode IV | doomed |
| 2 | THREEPIO | There'll be no escape for the Princess this time. | Episode IV | escape princess time |
| 3 | THREEPIO | What's that? | Episode IV | |
| 4 | THREEPIO | I should have known better than to trust the l... | Episode IV | known better trust logic thermocapsulary dehou... |
# Count word frequencies in the preprocessed dialogue (Counter and pyplot were already imported above)
dia_list = star_wars['new_script'].tolist()
split_list = [word for line in dia_list for word in line.split()]
counts = Counter(split_list)
word_count = pd.DataFrame(counts.most_common(50), columns=['word', 'count'])
word_count.head(20).plot.bar(x='word', y='count', color='red', title='Most frequently used words in the original trilogy')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# Relevant packages
from wordcloud import WordCloud
from PIL import Image
# Filter the dataset for Darth Vader's dialogue
vader_dialogue = star_wars.loc[star_wars['Character'] == 'VADER', 'new_script'].str.cat(sep=' ')
# Generate a word cloud for Darth Vader
vader_mask = np.array(Image.open('vader.jpg'))
vader_wordcloud = WordCloud(mask=vader_mask, background_color='white').generate(vader_dialogue)
# Plot the word cloud for Darth Vader
plt.figure(figsize=(10, 10))
plt.imshow(vader_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Darth Vader')
plt.show()
# Filter the dataset for Yoda's dialogue
yoda_dialogue = star_wars.loc[star_wars['Character'] == 'YODA', 'new_script'].str.cat(sep=' ')
# Generate a word cloud for Yoda
yoda_mask = np.array(Image.open('yoda.png'))
yoda_wordcloud = WordCloud(mask=yoda_mask, background_color='white').generate(yoda_dialogue)
# Plot the word cloud for Yoda
plt.figure(figsize=(10, 10))
plt.imshow(yoda_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Yoda')
plt.show()
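Both masks depend on local image files (vader.jpg, yoda.png). If those are unavailable, a plain rectangular cloud works with the same dialogue strings; a minimal fallback sketch:

# Rectangular word cloud, no image mask required
plain_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(yoda_dialogue)
plt.figure(figsize=(10, 5))
plt.imshow(plain_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()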
from sklearn.feature_extraction.text import TfidfVectorizer
# Combine the dialogue for all episodes into a single string
all_dialogue = star_wars['new_script'].str.cat(sep=' ')
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the dialogue using the TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform([all_dialogue])
# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get the TF-IDF scores for the words
tfidf_scores = tfidf_matrix.toarray()[0]
# Create a DataFrame of words and their TF-IDF scores
tfidf_df = pd.DataFrame({'Word': feature_names, 'TF-IDF Score': tfidf_scores})
# Sort the DataFrame by TF-IDF score in descending order
tfidf_df = tfidf_df.sort_values(by='TF-IDF Score', ascending=False)
# Display the 15 highest-scoring words (with a single document the IDF term is
# constant, so this ranking effectively reflects term frequency)
print(tfidf_df.head(15))
       Word  TF-IDF Score
1276   luke      0.227311
931     get      0.215611
946   going      0.202240
1768  right      0.188869
414    come      0.187198
1447     oh      0.185526
1189   know      0.183855
1911    sir      0.150427
2343   well      0.150427
1836    see      0.148755
943      go      0.148755
133   artoo      0.135384
957     got      0.135384
951    good      0.132041
2399    yes      0.130370
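TF-IDF becomes more informative with one document per episode, since words shared by all three films then get down-weighted. A sketch under that setup, reusing the names above (output not verified):

# One document per film, so IDF can discriminate between episodes
episode_docs = star_wars.groupby('episode')['new_script'].apply(' '.join)
episode_vectorizer = TfidfVectorizer()
episode_matrix = episode_vectorizer.fit_transform(episode_docs)
terms = episode_vectorizer.get_feature_names_out()
# Ten highest-scoring words per episode
for episode, row in zip(episode_docs.index, episode_matrix.toarray()):
    top_indices = row.argsort()[::-1][:10]
    print(episode, [terms[i] for i in top_indices])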
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# Initialize the sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()
# Function to calculate sentiment score
def calculate_sentiment_score(text):
sentiment = sia.polarity_scores(text)
return sentiment['compound']
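The analyzer is NLTK's VADER (Valence Aware Dictionary and sEntiment Reasoner; the name is a coincidence here), and the compound score runs from -1 (most negative) to +1 (most positive). A quick sanity check with two words that are certainly in its lexicon (exact values depend on the lexicon version):

print(calculate_sentiment_score('love'))  # positive compound score
print(calculate_sentiment_score('hate'))  # negative compound score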
# Mean compound sentiment for each character of interest
characters = {
    'Darth Vader': 'VADER',
    'Emperor Palpatine': 'EMPEROR',
    'Luke': 'LUKE',
    'Yoda': 'YODA',
}
sentiment_scores = {
    name: star_wars.loc[star_wars['Character'] == tag, 'new_script']
        .apply(calculate_sentiment_score)
        .mean()
    for name, tag in characters.items()
}
# Overall sentiment for the trilogy
sentiment_scores['overall trilogy'] = star_wars['new_script'].apply(calculate_sentiment_score).mean()
# Display the sentiment scores
for name, score in sentiment_scores.items():
    print(f"Sentiment for {name}:", score)
Sentiment for Darth Vader: 0.05819857142857142
Sentiment for Emperor Palpatine: 0.09916363636363636
Sentiment for Luke: 0.04355273833671399
Sentiment for Yoda: 0.045842857142857144
Sentiment for overall trilogy: 0.05496964640444975
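All five means sit slightly above neutral. A small bar chart makes the comparison easier to read; a minimal sketch using the sentiment_scores dict built above:

# Mean compound sentiment per character and for the trilogy overall
plt.figure(figsize=(8, 4))
plt.bar(list(sentiment_scores.keys()), list(sentiment_scores.values()), color='purple')
plt.axhline(0, color='black', linewidth=0.8)  # neutral reference line
plt.ylabel('Mean compound sentiment')
plt.title('VADER sentiment in the original trilogy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()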
# Interpretations for the sentiments are given in the Lab 4 report attached.