We use the Netflix titles dataset from Kaggle. This tabular dataset lists the movies and TV shows available on Netflix, along with details such as cast, director, country, date added, release year, rating, and duration. The text we analyze is stored in the `description` variable; the data can be read directly from the URL in the code below.
library(tidyverse)
library(tidytext)
library(knitr)
df <- readr::read_csv('https://bryantstats.github.io/math421/data/netflix_titles.csv')
df %>%
  head(2) %>%
  kable()
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description |
---|---|---|---|---|---|---|---|---|---|---|---|
s1 | TV Show | 3% | NA | João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi | Brazil | August 14, 2020 | 2020 | TV-MA | 4 Seasons | International TV Shows, TV Dramas, TV Sci-Fi & Fantasy | In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor. |
s2 | Movie | 7:19 | Jorge Michel Grau | Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato | Mexico | December 23, 2016 | 2016 | TV-MA | 93 min | Dramas, International Movies | After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive. |
A token is a meaningful unit of text, most often a single word. Tokenizing with `unnest_tokens()` converts each row of text into multiple rows, one token per row.
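As a minimal sketch, here is what tokenization does to a small made-up tibble (the two sentences are invented for illustration):

# Two made-up "documents" in a text column
tiny <- tibble(id = 1:2,
               text = c("A gripping crime drama.",
                        "Two friends travel the world."))

# unnest_tokens() lowercases the text, strips punctuation,
# and returns one row per word
tiny %>%
  unnest_tokens(input = text, output = word)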
# Ten most frequent words in TV show descriptions
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  filter(type == 'TV Show') %>%
  head(10) %>%
  ggplot(aes(x = n, y = reorder(word, n))) +
  geom_col() +
  labs(y = '', x = 'Frequency')
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  group_by(type) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, by = n, within = type)) %>%
  ggplot(aes(n, word, fill = type)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~type, scales = "free") +
  labs(x = "Frequency", y = NULL) +
  scale_y_reordered()
library(wordcloud)
pal <- brewer.pal(8, "Dark2")

# Word cloud of the 50 most frequent words across all descriptions
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE) %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 50, colors = pal))
# Word cloud for movie descriptions only
df %>%
  filter(type == 'Movie') %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 50, colors = pal))
In sentiment analysis, each word is mapped to a sentiment measure. Three common lexicons are:

- `nrc` maps a word to one or more of: positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
- `bing` maps a word to positive or negative.
- `AFINN` maps a word to an integer score between -5 (most negative) and 5 (most positive).
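To see what these lexicons look like, we can inspect a few rows of each (`get_sentiments()` is from tidytext; the `nrc` and `afinn` lexicons are downloaded via the textdata package on first use):

# Each lexicon is a tibble mapping words to sentiment labels or scores
get_sentiments("bing") %>% head(3)
get_sentiments("nrc") %>% head(3)
get_sentiments("afinn") %>% head(3)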
bing
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  group_by(type) %>%
  inner_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(type) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(type, n, fill = sentiment)) +
  geom_col(position = 'fill') +
  labs(y = 'Relative Frequency', x = '')
# Compare sentiment across release centuries (before vs. from 2000)
df %>%
  mutate(century = if_else(release_year >= 2000, '21', '20')) %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(century, word, sort = TRUE) %>%
  group_by(century) %>%
  inner_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(century) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(century, n, fill = sentiment)) +
  geom_col(position = 'fill') +
  labs(y = 'Relative Frequency', x = '')
nrc
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  group_by(type) %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(type) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(sentiment, n, fill = type)) +
  geom_col(position = 'fill') +
  labs(y = 'Relative Frequency', x = '')
afinn
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  group_by(type) %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(sentiment = value) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(type) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(type, n, fill = factor(sentiment))) +
  geom_col(position = 'dodge') +
  labs(y = 'Relative Frequency', fill = 'Sentiment', x = '')
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(type, word, sort = TRUE) %>%
  group_by(type) %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(sentiment = value) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(type) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(sentiment, n, fill = type)) +
  geom_col(position = 'dodge') +
  labs(y = 'Relative Frequency', x = '')
# Sentiment of descriptions by maturity rating
df %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(get_stopwords()) %>%
  count(rating, word, sort = TRUE) %>%
  group_by(rating) %>%
  inner_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE) %>%
  group_by(rating) %>%
  mutate(n = n / sum(n)) %>%
  ggplot(aes(rating, n, fill = sentiment)) +
  geom_col(position = 'dodge') +
  labs(y = 'Relative Frequency', x = '')
We build a model to predict the type of a video (movie vs. TV show) from its description. First we need to convert the description (text) into numeric values; we use the TF-IDF method to do this.
TF (Term Frequency) = (number of times term t appears in a document) / (total number of terms in the document).
IDF (Inverse Document Frequency) = log(total number of documents / number of documents containing term t). The base of the logarithm varies by implementation; the worked example below uses base 10.
TF-IDF = TF * IDF.
Example:
Consider a document containing 100 words in which the word cat appears 3 times. The term frequency (TF) for cat is then 3 / 100 = 0.03. Now, assume we have 10 million documents and the word cat appears in one thousand of them. The inverse document frequency (IDF) is then log10(10,000,000 / 1,000) = 4. The TF-IDF weight is the product of these quantities: 0.03 * 4 = 0.12.
Source: http://www.tfidf.com/
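The same arithmetic can be checked quickly in R (using the base-10 logarithm to match the example above; note that TF-IDF implementations such as `step_tfidf()` below may use a different base and smoothing, so exact values differ):

# TF: "cat" appears 3 times in a 100-word document
tf <- 3 / 100               # 0.03
# IDF: 1,000 of 10,000,000 documents contain "cat" (base-10 log)
idf <- log10(1e7 / 1e3)     # 4
tf * idf                    # 0.12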
library(caret)
library(themis)
library(textrecipes)
# Select the data and set the target variable
df <- df %>%
  mutate(target = type) %>%
  select(target, description)

# Convert the text to numeric variables via TF-IDF.
# Note: step_smote() balances the classes by oversampling; because it is
# applied before the train/test split, synthetic rows can end up in the test set.
a <- recipe(target ~ description, data = df) %>%
  step_tokenize(description) %>%
  step_tokenfilter(description, max_tokens = 50) %>%
  step_tfidf(description) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(target) %>%
  prep()

df <- juice(a)
# Use caret for modeling
set.seed(2021)
splitIndex <- createDataPartition(df$target, p = .7, list = FALSE)
df_train <- df[ splitIndex, ]
df_test  <- df[-splitIndex, ]

# Train a random forest with the ranger engine
forest_ranger <- train(target ~ ., data = df_train, method = "ranger")
pred <- predict(forest_ranger, df_test)
cm <- confusionMatrix(data = pred, reference = df_test$target)
cm$overall[1]
## Accuracy
## 0.8037818
d <- data.frame(pred = pred, obs = df_test$target)
library(yardstick)
# conf_mat() takes the truth column first, then the estimate
d %>% conf_mat(truth = obs, estimate = pred) %>% autoplot()