tweet_classifications <-
readRDS("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_with-all-tweet-classif-v0-2-0.rds")
tweet_classifications %>%
ggplot(aes(x = tweet_sentimentr_sentiment)) +
geom_histogram(binwidth = 0.1) +
ylim(-100, 350000) +
theme_bw()
tweet_classifications %>%
filter(tweet_flair_sentiment_value == "POSITIVE") %>%
ggplot(aes(x = tweet_flair_sentiment_confidence)) +
geom_histogram(binwidth = 0.1) +
xlim(-0.1,1.1) +
ylim(-100, 350000) +
theme_bw()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
tweet_classifications %>%
filter(tweet_flair_sentiment_value == "NEGATIVE") %>%
ggplot(aes(x = tweet_flair_sentiment_confidence)) +
geom_histogram(binwidth = 0.1) +
xlim(-0.1,1.1) +
ylim(-100, 350000) +
theme_bw()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
Based on the distributions illustrated above, the values produced by sentimentr are re-classified as positive if the sentiment value is 0.1 or greater, negative if -0.1 or lower, and neutral otherwise. Similarly, the values produced by flair are re-classified as neutral if the confidence value is lower than 0.95, and according to the sentiment value otherwise.
tweet_classifications <-
tweet_classifications %>%
mutate(
tweet_sentimentr_class = ordered(
case_when(
tweet_sentimentr_sentiment <= -0.1 ~ "Negative",
tweet_sentimentr_sentiment >= 0.1 ~ "Positive",
TRUE ~ "Neutral"
),
levels = c("Negative", "Neutral", "Positive")
),
tweet_flair_sentiment_class = ordered(
case_when(
tweet_flair_sentiment_confidence < 0.95 ~ "Neutral",
tweet_flair_sentiment_value == "NEGATIVE" ~ "Negative",
tweet_flair_sentiment_value == "POSITIVE" ~ "Positive"
),
levels = c("Negative", "Neutral", "Positive")
)
)
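As a quick, optional sanity check (not part of the original pipeline), the distribution of the derived sentiment classes can be inspected before moving on.
# Optional check: how many tweets fall into each derived sentiment class
tweet_classifications %>%
  count(tweet_sentimentr_class)
tweet_classifications %>%
  count(tweet_flair_sentiment_class)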
tweet_classifications %>%
select(tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_top_emotion) %>%
pivot_longer(
cols = tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_sadness,
names_to = "emotion",
values_to = "tweet_flair_e6c11m2_top_emotion_confidence"
) %>%
mutate(
emotion = str_remove(emotion, "tweet_flair_e6c11m2_")
) %>%
filter(
emotion == tweet_flair_e6c11m2_top_emotion
) %>%
ggplot(aes(x = tweet_flair_e6c11m2_top_emotion_confidence)) +
geom_histogram(binwidth = 0.01)
tweet_classifications %>%
select(tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_top_context) %>%
pivot_longer(
cols = tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_place_character,
names_to = "context",
values_to = "tweet_flair_c6c12m1_top_context_confidence"
) %>%
mutate(
context = str_remove(context, "tweet_flair_c6c12m1_")
) %>%
filter(
context == (tweet_flair_c6c12m1_top_context %>%
str_replace_all(" and ", "_") %>%
str_replace_all(", ", "_") %>%
str_replace_all(" ", "_"))
) %>%
ggplot(aes(x = tweet_flair_c6c12m1_top_context_confidence)) +
geom_histogram(binwidth = 0.01)
Based on the distributions illustrated above, the values produced by the flair text classifier models are re-classified as uncertain if the confidence value is lower than 0.95, and according to the model's predicted class otherwise.
tweet_classifications <-
tweet_classifications %>%
left_join(
tweet_classifications %>%
select(tweet_id_str, tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_top_emotion) %>%
pivot_longer(
cols = tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_sadness,
names_to = "emotion",
values_to = "tweet_flair_e6c11m2_top_emotion_confidence"
) %>%
mutate(
emotion = str_remove(emotion, "tweet_flair_e6c11m2_")
) %>%
filter(
emotion == tweet_flair_e6c11m2_top_emotion
) %>%
select(-emotion)
) %>%
relocate(
tweet_flair_e6c11m2_top_emotion_confidence,
.after = tweet_flair_e6c11m2_top_emotion
) %>%
left_join(
tweet_classifications %>%
select(tweet_id_str, tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_top_context) %>%
pivot_longer(
cols = tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_place_character,
names_to = "context",
values_to = "tweet_flair_c6c12m1_top_context_confidence"
) %>%
mutate(
context = str_remove(context, "tweet_flair_c6c12m1_")
) %>%
filter(
context == (tweet_flair_c6c12m1_top_context %>%
str_replace_all(" and ", "_") %>%
str_replace_all(", ", "_") %>%
str_replace_all(" ", "_"))
) %>%
select(-context)
) %>%
relocate(
tweet_flair_c6c12m1_top_context_confidence,
.after = tweet_flair_c6c12m1_top_context
) %>%
mutate(
tweet_flair_e6c11m2_top_emotion = if_else(
tweet_flair_e6c11m2_top_emotion_confidence < 0.95,
"uncertain", tweet_flair_e6c11m2_top_emotion
),
tweet_flair_c6c12m1_top_context = if_else(
tweet_flair_c6c12m1_top_context_confidence < 0.95,
"uncertain", tweet_flair_c6c12m1_top_context
)
)
## Joining, by = c("tweet_id_str", "tweet_flair_e6c11m2_top_emotion")
## Joining, by = c("tweet_id_str", "tweet_flair_c6c12m1_top_context")
Double-check the result.
tweet_classifications %>%
count(tweet_flair_e6c11m2_top_emotion) %>%
kable()
| tweet_flair_e6c11m2_top_emotion | n |
| --- | --- |
| admiration | 73547 |
| annoyance | 38992 |
| anticipation | 40162 |
| excitement | 37889 |
| gratitude | 10959 |
| interest | 39486 |
| joy | 579108 |
| sadness | 29222 |
| uncertain | 623 |
tweet_classifications %>%
count(tweet_flair_c6c12m1_top_context) %>%
kable()
| tweet_flair_c6c12m1_top_context | n |
| --- | --- |
| commercial | 55849 |
| community events | 29706 |
| connecting and sharing | 194357 |
| family, friendship and relationships | 19574 |
| health, fitness and wellbeing | 28137 |
| leisure, hobbies and interests | 493237 |
| local environment | 7490 |
| place character | 9964 |
| uncertain | 11674 |
In this section, we look at the relationship between the different classifications, exploring the number of tweets classified at the intersection between classes from different classifications.
The find_match function generates a cross-tabulation of two classifications and identifies as matching those classes that share the highest number of elements falling at their intersection.
find_match <- function(data, a, b){
# Create crosstabulation
# as a long data frame
crosstab_df <- table(
data %>% pull(.data[[a]]),
data %>% pull(.data[[b]]),
dnn = c(a, b)
) %>%
as.data.frame(stringsAsFactors = FALSE)
matching <- tibble(class_a = c(NA), class_b = c(NA))
# Find best match
while( nrow(crosstab_df) > 0 ){
# Select the highest value for match for class A
this_class_a <-
crosstab_df %>%
slice_max(order_by = Freq, n = 1, with_ties = FALSE) %>%
pull(.data[[a]])
# Select the respective highest value among intersecting class B
this_class_b <-
crosstab_df %>%
filter(.data[[a]] == this_class_a) %>%
slice_max(order_by = Freq, n = 1, with_ties = FALSE) %>%
pull(.data[[b]])
# Add them as a match
crosstab_df <-
crosstab_df %>%
filter(
.data[[a]] != this_class_a &
.data[[b]] != this_class_b
)
#cat(this_class_a, this_class_b, "\n")
matching <-
matching %>%
add_row(
class_a = this_class_a,
class_b = this_class_b
)
}
# Add NAs for classes without matches
matching <-
matching %>%
filter(!is.na(class_a))
missing_a <-
data %>%
filter(
!(.data[[a]] %in% (matching %>% pull(class_a))) &
!(is.na(.data[[a]]))
) %>%
pull(.data[[a]]) %>%
unique()
if (length(missing_a) > 0) {
for (a_missing_a in missing_a) {
matching <-
matching %>%
add_row(
class_a = a_missing_a %>% as.character(),
class_b = NA
)
}
}
missing_b <-
data %>%
filter(
!(.data[[b]] %in% (matching %>% pull(class_b))) &
!(is.na(.data[[b]]))
) %>%
pull(.data[[b]]) %>%
unique()
if (length(missing_b) > 0) {
for (a_missing_b in missing_b) {
matching <-
matching %>%
add_row(
class_a = NA,
class_b = a_missing_b %>% as.character()
)
}
}
colnames(matching) <- c(a, b)
matching
}
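To illustrate how find_match pairs classes, the snippet below is a minimal usage sketch (not part of the original analysis) on a toy data frame with two made-up classifications, colour and shape, chosen so that the expected pairing is unambiguous.
# Toy example: each colour should be matched with the shape
# it most frequently co-occurs with
toy_classifications <-
  tibble(
    colour = c(rep("red", 5), rep("blue", 3), rep("green", 2)),
    shape  = c(rep("circle", 4), rep("square", 4), rep("triangle", 2))
  )
toy_classifications %>%
  find_match("colour", "shape")
# Expected matches: red with circle, blue with square, green with triangle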
The match_and_plot function creates a heatmap illustrating the number of cases at the intersection of each pair of classes from two classifications, using geom_tile to generate the visualisation and find_match to make it clearer, by sorting the classes so that, if the two classifications are correlated, a clear diagonal of high values should appear.
match_and_plot <- function(data, a, a_title, b, b_title){
this_mapping <- find_match(data, a, b)
data_tile_plot_main <-
data %>%
count(.data[[a]], .data[[b]]) %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = n
)
) +
geom_tile() +
#ggplot2::scale_fill_viridis_b(
ggplot2::scale_fill_viridis_c(
trans = "log",
# breaks = c(1000, 3000, 10000, 30000, 100000, 300000),
# labels = c("1,000", "3,000", "10,000", "30,000", "100,000", "300,000"),
breaks = breaks_log(n = 6, base = 10),
labels = label_comma(),
name = "Count"
) +
xlab(a_title) +
ylab(b_title) +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
)
data_tile_plot_A <-
data %>%
count(.data[[a]], .data[[b]]) %>%
group_by(.data[[a]]) %>%
mutate(perc = (n/sum(n))*100) %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = perc
)
) +
geom_tile() +
#ggplot2::scale_fill_viridis_b(
ggplot2::scale_fill_viridis_c(
trans = "log",
# breaks = c(1, 2.5, 5, 10, 25, 50),
# labels = c("1", "2.5", "5", "10", "25", "50"),
breaks = breaks_log(n = 6, base = 2),
labels = label_comma(),
name = paste("Percentage\nby", a_title)
) +
#xlab("Predicted context") +
xlab("") +
#ylab("Predicted emotion") +
ylab("") +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
)
data_tile_plot_B <-
data %>%
count(.data[[a]], .data[[b]]) %>%
group_by(.data[[b]]) %>%
mutate(perc = (n/sum(n))*100) %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = perc
)
) +
geom_tile() +
#ggplot2::scale_fill_viridis_b(
ggplot2::scale_fill_viridis_c(
trans = "log",
# breaks = c(1, 2.5, 5, 10, 25, 50),
# labels = c("1", "2.5", "5", "10", "25", "50"),
breaks = breaks_log(n = 6, base = 2),
labels = label_comma(),
name = paste("Percentage\nby", b_title)
) +
#xlab("Predicted context") +
xlab("") +
#ylab("Predicted emotion") +
ylab("") +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
)
data_tile_all <-
(data_tile_plot_main + theme(plot.margin = unit(c(0,0,0,0), "pt"))) +
(
(data_tile_plot_A + theme(plot.margin = unit(c(0,0,0,0), "pt"))) /
(data_tile_plot_B + theme(plot.margin = unit(c(0,0,0,0), "pt")))
)
data_tile_all
# ggsave(
# paste0(
# "../images/MMc-compare-classifications-tweets_",
# a, "_", b, ".pdf"
# ),
# width = 400, height = 200, units = "mm",
# data_tile_all
# )
}
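For instance, the helper could also be applied to the two sentiment classifications derived above (a usage sketch, not one of the original figures, which instead visualise this particular pair as a bubble plot further below).
# Usage sketch: heatmap comparing the sentimentr and flair sentiment classes
tweet_classifications %>%
  match_and_plot(
    "tweet_sentimentr_class", "Sentimentr",
    "tweet_flair_sentiment_class", "Flair sentiment"
  )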
The match_and_plot_top function is similar to the match_and_plot function above, but only the most frequent classes are visualised: the top 20 classes (with ties) from each classification, along with the classes from the other classification with which they most frequently intersect.
match_and_plot_top <- function(data, a, a_title, b, b_title){
this_mapping <- find_match(data, a, b)
top_perc <-
bind_rows(
data %>%
count(.data[[a]]) %>%
slice_max(order_by = n, n = 20) %>%
left_join(
data %>%
count(.data[[a]], .data[[b]]) %>%
filter(
!is.na(.data[[a]]) &
!is.na(.data[[b]])
) %>%
group_by(.data[[a]]) %>%
mutate(
perc = (n/sum(n))*100
) %>%
slice_max(order_by = perc, n = 1) %>%
select(.data[[a]], .data[[b]])
),
data %>%
count(.data[[b]]) %>%
slice_max(order_by = n, n = 20) %>%
left_join(
data %>%
count(.data[[a]], .data[[b]]) %>%
filter(
!is.na(.data[[a]]) &
!is.na(.data[[b]])
) %>%
group_by(.data[[b]]) %>%
mutate(
perc = (n/sum(n))*100
) %>%
slice_max(order_by = perc, n = 1) %>%
select(.data[[a]], .data[[b]])
)
)
data_tile_plot_main <-
data %>%
count(.data[[a]], .data[[b]]) %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = n
)
) +
geom_tile() +
#ggplot2::scale_fill_viridis_c(
ggplot2::scale_fill_viridis_b(
trans = "log",
# breaks = c(10, 30, 100, 300, 1000, 3000),
# labels = c("10", "30", "100", "300", "1,000", "3,000"),
breaks = breaks_log(n = 6, base = 2),
labels = label_comma(),
name = "Count"
) +
xlab(a_title) +
ylab(b_title) +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
) +
xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
ylim(top_perc %>% pull(.data[[b]]) %>% unique())
# xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
# ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
data_tile_plot_A <-
data %>%
count(.data[[a]], .data[[b]]) %>%
group_by(.data[[a]]) %>%
mutate(perc = (n/sum(n))*100) %>%
ungroup() %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = perc
)
) +
geom_tile() +
# ggplot2::scale_fill_viridis_b(
ggplot2::scale_fill_viridis_c(
trans = "log",
# breaks = c(1, 2.5, 5, 10, 25, 50),
# labels = c("1", "2.5", "5", "10", "25", "50"),
breaks = breaks_log(n = 6, base = 2),
labels = label_comma(),
name = paste("Percentage\nby", a_title)
) +
#xlab("Predicted context") +
xlab("") +
#ylab("Predicted emotion") +
ylab("") +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
) +
xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
ylim(top_perc %>% pull(.data[[b]]) %>% unique())
# xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
# ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
data_tile_plot_B <-
data %>%
count(.data[[a]], .data[[b]]) %>%
group_by(.data[[b]]) %>%
mutate(perc = (n/sum(n))*100) %>%
ungroup() %>%
ggplot(
aes(
x = ordered(
.data[[a]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
y = ordered(
.data[[b]] %>% as.character(),
levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
fill = perc
)
) +
geom_tile() +
# ggplot2::scale_fill_viridis_b(
ggplot2::scale_fill_viridis_c(
trans = "log",
# breaks = c(1, 2.5, 5, 10, 25, 50),
# labels = c("1", "2.5", "5", "10", "25", "50"),
breaks = breaks_log(n = 6, base = 2),
labels = label_comma(),
name = paste("Percentage\nby", b_title)
) +
#xlab("Predicted context") +
xlab("") +
#ylab("Predicted emotion") +
ylab("") +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
) +
xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
ylim(top_perc %>% pull(.data[[b]]) %>% unique())
# xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
# ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
data_tile_all <-
(data_tile_plot_main + theme(plot.margin = unit(c(0,0,0,0), "pt"))) +
(
(data_tile_plot_A + theme(plot.margin = unit(c(0,0,0,0), "pt"))) /
(data_tile_plot_B + theme(plot.margin = unit(c(0,0,0,0), "pt")))
)
data_tile_all
# ggsave(
# paste0(
# "../images/MMc-compare-classifications-tweets_",
# a, "_", b, ".pdf"
# ),
# width = 400, height = 200, units = "mm", limitsize = FALSE,
# data_tile_all
# )
}
tweet_classifications %>%
count(tweet_sentimentr_class, tweet_flair_sentiment_class) %>%
ggplot(
aes(
x = ordered(
tweet_sentimentr_class,
levels = c("Negative", "Neutral", "Positive")),
y = ordered(
tweet_flair_sentiment_class,
levels = c("Negative", "Neutral", "Positive")),
size = n,
colour = n
)
) +
geom_point() +
scale_size(name = "Count") +
scale_colour_viridis_b(
name = "Count"
) +
xlab("Sentimentr") +
ylab("Flair Sentiment") +
coord_fixed(ratio = 1) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x =
element_text(
angle = 45,
vjust = 1,
hjust = 1
)
)
The names of the classes are shortened to make the visualisations more readable.
tweet_classifications_to_plot <-
tweet_classifications %>%
mutate(
across(
c(
tweet_flair_e6c11m2_top_emotion,
tweet_flair_c6c12m1_top_context,
btm200bg_topic_sum_b,
trans_umap_hdbscan
),
~str_trunc(.x, 15)
)
)
tweet_classifications_to_plot %>%
match_and_plot(
"tweet_flair_e6c11m2_top_emotion", "Emotion",
"tweet_flair_c6c12m1_top_context", "Context"
)
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Plot
match_and_plot_top(
"btm200bg_topic_sum_b", "BTM topics",
"trans_umap_hdbscan", "Trans.-based"
)
## Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
## ℹ Please use `all_of(var)` (or `any_of(var)`) instead of `.data[[var]]`
## Joining, by = "btm200bg_topic_sum_b"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 66007 rows containing missing values (`geom_tile()`).
## Warning: Removed 66007 rows containing missing values (`geom_tile()`).
## Removed 66007 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Plot
match_and_plot_top(
"tweet_flair_e6c11m2_top_emotion", "Emotion",
"btm200bg_topic_sum_b", "BTM topics"
)
## Joining, by = "tweet_flair_e6c11m2_top_emotion"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 1504 rows containing missing values (`geom_tile()`).
## Removed 1504 rows containing missing values (`geom_tile()`).
## Removed 1504 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Plot
match_and_plot_top(
"tweet_flair_c6c12m1_top_context", "Context",
"btm200bg_topic_sum_b", "BTM topics"
)
## Joining, by = "tweet_flair_c6c12m1_top_context"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 1581 rows containing missing values (`geom_tile()`).
## Removed 1581 rows containing missing values (`geom_tile()`).
## Removed 1581 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Plot
match_and_plot_top(
"tweet_flair_e6c11m2_top_emotion", "Emotion",
"trans_umap_hdbscan", "Trans.-based"
)
## Joining, by = "tweet_flair_e6c11m2_top_emotion"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 10081 rows containing missing values (`geom_tile()`).
## Removed 10081 rows containing missing values (`geom_tile()`).
## Removed 10081 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Plot
match_and_plot_top(
"tweet_flair_c6c12m1_top_context", "Context",
"trans_umap_hdbscan", "Trans.-based"
)
## Joining, by = "tweet_flair_c6c12m1_top_context"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 10143 rows containing missing values (`geom_tile()`).
## Removed 10143 rows containing missing values (`geom_tile()`).
## Removed 10143 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Plot
mutate(
emotion_context = paste0(
str_trunc(tweet_flair_e6c11m2_top_emotion, 5, ellipsis = ""),
"+",
str_trunc(tweet_flair_c6c12m1_top_context, 8, ellipsis = "")
)
) %>%
match_and_plot_top(
"emotion_context", "Emotion + Context",
"btm200bg_topic_sum_b", "BTM topics"
)
## Joining, by = "emotion_context"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 8929 rows containing missing values (`geom_tile()`).
## Removed 8929 rows containing missing values (`geom_tile()`).
## Removed 8929 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Plot
mutate(
emotion_context = paste0(
str_trunc(tweet_flair_e6c11m2_top_emotion, 5, ellipsis = ""),
"+",
str_trunc(tweet_flair_c6c12m1_top_context, 8, ellipsis = "")
)
) %>%
match_and_plot_top(
"emotion_context", "Emotion + Context",
"trans_umap_hdbscan", "Trans.-based"
)
## Joining, by = "emotion_context"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 28856 rows containing missing values (`geom_tile()`).
## Removed 28856 rows containing missing values (`geom_tile()`).
## Removed 28856 rows containing missing values (`geom_tile()`).
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Select clusters
right_join(
tweet_classifications_to_plot %>%
group_by(trans_umap_hdbscan) %>%
summarise(
n = n(),
mean_trans_umap_hdbscan_prob = mean(trans_umap_hdbscan_prob),
sd_trans_umap_hdbscan_prob = sd(trans_umap_hdbscan_prob)
) %>%
ungroup() %>%
filter(n > 100 & sd_trans_umap_hdbscan_prob < 0.01) %>%
slice_max(order_by = mean_trans_umap_hdbscan_prob, n = 20, with_ties = FALSE) %>%
select(trans_umap_hdbscan)
) %>%
# Plot
match_and_plot(
"tweet_flair_e6c11m2_top_emotion", "Emotion",
"trans_umap_hdbscan", "Trans.-based"
)
## Joining, by = "trans_umap_hdbscan"
tweet_classifications_to_plot %>%
# Filter-out massive-reply cluster
filter(!(trans_umap_hdbscan %in% c("0"))) %>%
# Select clusters
right_join(
tweet_classifications_to_plot %>%
group_by(trans_umap_hdbscan) %>%
summarise(
n = n(),
mean_trans_umap_hdbscan_prob = mean(trans_umap_hdbscan_prob),
sd_trans_umap_hdbscan_prob = sd(trans_umap_hdbscan_prob)
) %>%
ungroup() %>%
filter(n > 100 & sd_trans_umap_hdbscan_prob < 0.01) %>%
slice_max(order_by = mean_trans_umap_hdbscan_prob, n = 20, with_ties = FALSE) %>%
select(trans_umap_hdbscan)
) %>%
# Plot
match_and_plot(
"tweet_flair_c6c12m1_top_context", "Context",
"trans_umap_hdbscan", "Trans.-based"
)
## Joining, by = "trans_umap_hdbscan"
The plots below illustrate the relationship between the different classifications in an area of the two-dimensional space created through the UMAP dimensionality reduction process.
The following code defines the colour palettes used in the plots below, based on the colour-blind-safe palettes developed by Paul Tol.
# Grey colour from the Vibrant qualitative palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_VibrantGrey <- "#BBBBBB"
# High-contrast qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_RYB <- c("#BB5566", "#DDAA33", "#004488")
# Muted qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_Muted <- c("#CC6677", "#332288", "#DDCC77", "#117733", "#88CCEE", "#882255", "#44AA99", "#999933", palette_PaulTol_VibrantGrey)
# Light qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_Light <- c("#77AADD", "#EE8866", "#EEDD88", "#AAAA00", "#99DDFF", "#44BB99", "#BBCC33", "#FFAABB", "#DDDDDD")
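The palettes can be previewed with an optional check (assuming the scales package is available, as it is elsewhere in this script for breaks_log and label_comma).
# Optional preview of the qualitative palettes (not part of the original plots)
scales::show_col(palette_PaulTol_Muted)
scales::show_col(palette_PaulTol_Light)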
umap_plane_sentimentr <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
#colour = tweet_sentimentr_sentiment
colour = tweet_sentimentr_class
)) +
geom_point(size = 0.01) +
# scale_color_distiller(
# name = "Sentimentr (plot a)",
# palette = "RdYlBu",
# direction = 1,
# limits = c(-1, 1),
# breaks = c(-1, -0.5, 0, 0.5, 1),
# label = c("<= -1", "-0.5", "0", "0.5", ">= 1"),
# oob = squish
# ) +
# scale_colour_brewer(
# palette = "RdYlBu"
# ) +
scale_colour_manual(
values = palette_PaulTol_RYB
) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Sentimentr") +
theme_bw() +
theme(legend.position = "none")
umap_plane_flair_sentiment <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
#colour = tweet_flair_sentiment_value
colour = tweet_flair_sentiment_class
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set3"
# ) +
# scale_colour_brewer(
# palette = "RdYlBu"
# ) +
scale_colour_manual(
values = palette_PaulTol_RYB
) +
guides(colour = guide_legend(
"Sentiment (plot a and b)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Flair sentiment") +
theme_bw()
#+ theme(legend.position = "none")
umap_plane_btm <-
tweet_classifications %>%
mutate(
btm200bg_topic_sum_b = na_if(btm200bg_topic_sum_b, -1)
) %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = as_factor(as.character(btm200bg_topic_sum_b))
)) +
geom_point(size = 0.01) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("BTM (repeated colours)") +
theme_bw() +
theme(legend.position = "none")
umap_plane_trans <-
tweet_classifications %>%
mutate(
trans_umap_hdbscan = na_if(trans_umap_hdbscan, -1)
) %>%
left_join(
tweet_classifications %>%
filter(
trans_umap_hdbscan %in% c("250", "251", "458", "456")
) %>%
group_by(trans_umap_hdbscan) %>%
# select the point closest to the
# left border of the plot to label
mutate(
dist_from_origin = sqrt(((trans_umap_comp1 + 1) ^ 2) + ((trans_umap_comp2 + 2) ^ 2))
) %>%
slice_min(
order_by = dist_from_origin,
n=1
) %>%
mutate(
plot_label = trans_umap_hdbscan
) %>%
select(tweet_id_str, plot_label)
) %>%
#replace_na(list(plot_label = "")) %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = as_factor(as.character(trans_umap_hdbscan)),
label = plot_label
)) +
geom_point(size = 0.01) +
#geom_label(
geom_label_repel(
fill = "#FFFFFF",
colour = "#000000",
alpha = 0.7,
#size = 3,
force_pull = 0.5,
box.padding = 0,
max.overlaps = Inf,
segment.curvature = -0.1,
min.segment.length = 3
) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Transformer-based (repeated colours)") +
#theme(legend.position = "none")
scale_colour_discrete(
breaks = c("250"),
labels = c("The colours used in plots c and\nd are randomised and repeated\nmultiple times for different clusters,\naiming to achieve a simple visual\ndiscernibility of the clusters\nrather than full identification.")
) +
guides(colour = guide_legend(
"Topic (plot c and d)",
override.aes = list(alpha=0.0)
)) +
theme_bw()
## Adding missing grouping variables: `trans_umap_hdbscan`
## Joining, by = c("tweet_id_str", "trans_umap_hdbscan")
umap_plane_emotion <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = tweet_flair_e6c11m2_top_emotion
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set3"
# ) +
scale_colour_manual(
values = palette_PaulTol_Light
) +
guides(colour = guide_legend(
"Emotion (plot e)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Emotion") +
theme_bw()
#+ theme(legend.position = "none")
umap_plane_context <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = tweet_flair_c6c12m1_top_context
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set1"
# ) +
scale_colour_manual(
values = palette_PaulTol_Muted
) +
guides(colour = guide_legend(
"Context (plot f)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(0, 4) + ylim(-3, 1) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Context") +
theme_bw()
#+ theme(legend.position = "none")
(umap_plane_sentimentr + umap_plane_flair_sentiment) /
(umap_plane_btm + umap_plane_trans) /
(umap_plane_emotion + umap_plane_context) +
plot_annotation(
tag_levels = c("a")
#, caption = 'The colours used in plot c and b are randomised and repeated multiple times for different cluster, aiming\nto achieve a simple visual discernability of the different clusters rather than full identification.'
) +
plot_layout(
guides = 'collect'
)
## Warning: Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Warning: Removed 849984 rows containing missing values (`geom_label_repel()`).
## Warning: Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
umap_plane_sentimentr <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
#colour = tweet_sentimentr_sentiment
colour = tweet_sentimentr_class
)) +
geom_point(size = 0.01) +
# scale_color_distiller(
# name = "Sentimentr (plot a)",
# palette = "RdYlBu",
# direction = 1,
# limits = c(-1, 1),
# breaks = c(-1, -0.5, 0, 0.5, 1),
# label = c("<= -1", "-0.5", "0", "0.5", ">= 1"),
# oob = squish
# ) +
# scale_colour_brewer(
# palette = "RdYlBu"
# ) +
scale_colour_manual(
values = palette_PaulTol_RYB
) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Sentimentr") +
theme_bw() +
theme(legend.position = "none")
umap_plane_flair_sentiment <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
#colour = tweet_flair_sentiment_value
colour = tweet_flair_sentiment_class
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set3"
# ) +
# scale_colour_brewer(
# palette = "RdYlBu"
# ) +
scale_colour_manual(
values = palette_PaulTol_RYB
) +
guides(colour = guide_legend(
"Sentiment (plot a and b)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Flair sentiment") +
theme_bw()
#+ theme(legend.position = "none")
umap_plane_btm <-
tweet_classifications %>%
mutate(
btm200bg_topic_sum_b = na_if(btm200bg_topic_sum_b, -1)
) %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = as_factor(as.character(btm200bg_topic_sum_b))
)) +
geom_point(size = 0.01) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("BTM (repeated colours)") +
theme_bw() +
theme(legend.position = "none")
umap_plane_trans <-
tweet_classifications %>%
mutate(
trans_umap_hdbscan = na_if(trans_umap_hdbscan, -1)
) %>%
left_join(
tweet_classifications %>%
filter(
trans_umap_hdbscan %in% c("1016", "543", "627", "746", "869")
) %>%
group_by(trans_umap_hdbscan) %>%
# select the point closest to the
# top-left corner of the plot to label
mutate(
dist_from_origin = sqrt(((trans_umap_comp1 - 6) ^ 2) + ((trans_umap_comp2 - 7) ^ 2))
) %>%
slice_min(
order_by = dist_from_origin,
n = 1
) %>%
mutate(
plot_label = trans_umap_hdbscan
) %>%
select(tweet_id_str, plot_label)
) %>%
#replace_na(list(plot_label = "")) %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = as_factor(as.character(trans_umap_hdbscan)),
label = plot_label
)) +
geom_point(size = 0.01) +
#geom_label(
geom_label_repel(
fill = "#FFFFFF",
colour = "#000000",
alpha = 0.7,
#size = 3,
force_pull = 0.5,
box.padding = 0,
max.overlaps = Inf,
segment.curvature = -0.1,
min.segment.length = 0
) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Transformer-based (repeated colours)") +
theme_bw() +
theme(legend.position = "none")
## Adding missing grouping variables: `trans_umap_hdbscan`
## Joining, by = c("tweet_id_str", "trans_umap_hdbscan")
umap_plane_emotion <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = tweet_flair_e6c11m2_top_emotion
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set3"
# ) +
scale_colour_manual(
values = palette_PaulTol_Light
) +
guides(colour = guide_legend(
"Emotion (plot e)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Emotion") +
theme_bw()
#+ theme(legend.position = "none")
umap_plane_context <-
tweet_classifications %>%
ggplot(aes(
x = trans_umap_comp1,
y = trans_umap_comp2,
colour = tweet_flair_c6c12m1_top_context
)) +
geom_point(size = 0.01) +
# scale_colour_brewer(
# palette = "Set1"
# ) +
scale_colour_manual(
values = palette_PaulTol_Muted
) +
guides(colour = guide_legend(
"Context (plot f)",
override.aes = list(size=5)
)) +
#xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
xlim(6, 10) + ylim(3, 7) +
xlab("Transformer-UMAP comp. 1") +
ylab("Transformer-UMAP comp. 2") +
coord_fixed(ratio = 1) +
ggtitle("Context") +
theme_bw()
#+ theme(legend.position = "none")
(umap_plane_sentimentr + umap_plane_flair_sentiment) /
(umap_plane_btm + umap_plane_trans) /
(umap_plane_emotion + umap_plane_context) +
plot_annotation(
tag_levels = c("a"),
caption = 'The colours used in plots c and d are randomised and repeated multiple times for different clusters, aiming\nto achieve a simple visual discernibility of the different clusters rather than full identification.'
) +
plot_layout(
guides = 'collect'
)
## Warning: Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Warning: Removed 849983 rows containing missing values (`geom_label_repel()`).
## Warning: Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).