library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggthemes)
library(patchwork)
library(tidytext)
library(knitr)
tweets_clusters <-
read_csv(
"../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_clr-noemo_embeddings-twitter-roberta-base_umap-2d_dbscans.csv",
col_types = list(
tweet_id_str = col_character(),
tweet_url = col_character(),
umap_comp1 = col_double(),
umap_comp2 = col_double(),
umap_dbscan_eps005 = col_double(),
umap_dbscan_eps006 = col_double(),
umap_dbscan_eps007 = col_double(),
umap_dbscan_eps008 = col_double(),
umap_dbscan_eps009 = col_double(),
umap_dbscan_eps010 = col_double(),
umap_dbscan_eps011 = col_double(),
umap_dbscan_eps012 = col_double(),
umap_dbscan_eps013 = col_double(),
umap_dbscan_eps014 = col_double(),
umap_dbscan_eps015 = col_double(),
umap_dbscan_eps016 = col_double(),
umap_dbscan_eps017 = col_double(),
umap_dbscan_eps018 = col_double(),
umap_dbscan_eps019 = col_double(),
umap_dbscan_eps020 = col_double(),
umap_dbscan_eps021 = col_double(),
umap_dbscan_eps022 = col_double(),
umap_dbscan_eps023 = col_double(),
umap_dbscan_eps024 = col_double(),
umap_dbscan_eps025 = col_double(),
umap_hdbscan = col_double(),
umap_hdbscan_prob = col_double(),
umap_hdbscan_outlier_score = col_double()
)
)
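As an aside, the long explicit column specification above can be written more compactly with readr's `.default` mechanism; a minimal sketch, assuming the same file and column names:
# Hypothetical compact alternative: default every column to double,
# overriding only the two identifier columns
tweets_clusters <-
read_csv(
"../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_clr-noemo_embeddings-twitter-roberta-base_umap-2d_dbscans.csv",
col_types = cols(
.default = col_double(),
tweet_id_str = col_character(),
tweet_url = col_character()
)
)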
for (i_eps in c(5, 10, 15, 20, 25)) {
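# Build the cluster column name for this eps value, e.g. i_eps = 5 -> "umap_dbscan_eps005"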
eps_var <-
i_eps %>%
str_pad(width = 3, pad = "0") %>%
paste0("umap_dbscan_eps", .)
plot_largest_clusters <-
tweets_clusters %>%
right_join(
tweets_clusters %>%
filter(.data[[ eps_var ]] != -1) %>%
count(.data[[ eps_var ]]) %>%
ungroup() %>%
slice_max(order_by = n, n = 20)
) %>%
#count(.data[[ eps_var ]])
ggplot(aes(
x = umap_comp1,
y = umap_comp2,
colour = as_factor(.data[[ eps_var ]])
)) +
geom_point(size = 0.02) +
scale_colour_tableau(palette = "Tableau 20") +
guides(colour = "none") +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("20 largest clusters")
plot_other_clusters <-
tweets_clusters %>%
anti_join(
tweets_clusters %>%
filter(.data[[ eps_var ]] != -1) %>%
count(.data[[ eps_var ]]) %>%
ungroup() %>%
slice_max(order_by = n, n = 20)
) %>%
filter(.data[[ eps_var ]] != -1) %>%
#count(.data[[ eps_var ]])
ggplot(aes(
x = umap_comp1,
y = umap_comp2,
colour = as_factor(as.character(.data[[ eps_var ]]))
)) +
geom_point(size = 0.02) +
guides(colour = "none") +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("All other clusters (same colour used multiple times for different clusters)")
plot_no_clusters <-
tweets_clusters %>%
filter(.data[[ eps_var ]] == -1) %>%
#count(.data[[ eps_var ]])
ggplot(aes(
x = umap_comp1,
y = umap_comp2
)) +
geom_point(size = 0.02) +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("Unclustered data")
plots_all_clusters <-
plot_largest_clusters + plot_other_clusters + plot_no_clusters +
plot_annotation(
title = sprintf('Clustering using eps=%.2f', i_eps / 100)
)
print(plots_all_clusters)
}
## Joining, by = "umap_dbscan_eps005"
## Joining, by = "umap_dbscan_eps005"
## Joining, by = "umap_dbscan_eps010"
## Joining, by = "umap_dbscan_eps010"
## Joining, by = "umap_dbscan_eps015"
## Joining, by = "umap_dbscan_eps015"
## Joining, by = "umap_dbscan_eps020"
## Joining, by = "umap_dbscan_eps020"
## Joining, by = "umap_dbscan_eps025"
## Joining, by = "umap_dbscan_eps025"
cluster_sizes_to_plot <-
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
count(umap_hdbscan, sort = TRUE) %>%
ungroup() %>%
mutate(rowname = row_number())
cluster_sizes_at_20 <-
cluster_sizes_to_plot %>%
filter(rowname == 20) %>%
pull(n)
cluster_sizes_at_200 <-
cluster_sizes_to_plot %>%
filter(rowname == 200) %>%
pull(n)
plot_cluster_size_power <-
cluster_sizes_to_plot %>%
ggplot(aes(x = rowname, y = n)) +
geom_point(size = 0.1) +
xlab("Cluster order\n") +
ylab("Cluster size") +
scale_x_log10() +
scale_y_log10() +
geom_line(
data = data.frame(
x = c(20, 20, 1000000),
y = c(0.001, cluster_sizes_at_20, cluster_sizes_at_20)
),
aes(x = x, y = y),
colour = "#1b9e77"
) +
geom_line(
data = data.frame(
x = c(200, 200, 1000000),
y = c(0.001, cluster_sizes_at_200, cluster_sizes_at_200)
),
aes(x = x, y = y),
colour = "#d95f02"
) +
coord_cartesian(
xlim = c(
0.999,
cluster_sizes_to_plot %>% nrow() %>% `*`(1.001)
),
ylim = c(
cluster_sizes_to_plot %>% pull(n) %>% min() %>% `*`(0.999),
cluster_sizes_to_plot %>% pull(n) %>% max() %>% `*`(1.001)
)
) +
ggtitle("Log-log plot") +
theme(plot.title = element_text(size=8))
plot_cluster_size_elbow <-
cluster_sizes_to_plot %>%
ggplot(aes(x = rowname, y = n)) +
geom_point(size = 0.1) +
xlab("Cluster order\n") +
ylab("Cluster size") +
scale_y_log10() +
geom_line(
data = data.frame(
x = c(20, 20, 1000000),
y = c(0.001, cluster_sizes_at_20, cluster_sizes_at_20)
),
aes(x = x, y = y),
colour = "#1b9e77"
) +
geom_line(
data = data.frame(
x = c(200, 200, 1000000),
y = c(0.001, cluster_sizes_at_200, cluster_sizes_at_200)
),
aes(x = x, y = y),
colour = "#d95f02"
) +
coord_cartesian(
xlim = c(
0.999,
cluster_sizes_to_plot %>% nrow() %>% `*`(1.001)
),
ylim = c(
cluster_sizes_to_plot %>% pull(n) %>% min() %>% `*`(0.999),
cluster_sizes_to_plot %>% pull(n) %>% max() %>% `*`(1.001)
)
) +
ggtitle("Log-linear plot") +
theme(plot.title = element_text(size=8))
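To roughly quantify the power-law behaviour suggested by the log-log plot, a linear fit in log-log space gives an indicative exponent; a minimal sketch on the same data (not a formal power-law test):
# Sketch: approximate power-law exponent from a log-log linear fit
lm(log10(n) ~ log10(rowname), data = cluster_sizes_to_plot) %>%
coef()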
plot_cluster_size_prob <-
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
group_by(umap_hdbscan) %>%
summarise(
mean_umap_hdbscan_prob = mean(umap_hdbscan_prob),
mean_umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score),
n = n()
) %>%
ungroup() %>%
ggplot(aes(x = mean_umap_hdbscan_prob, y = n)) +
geom_point(size = 0.1) +
scale_y_log10() +
geom_hline(
yintercept = cluster_sizes_at_20,
colour = "#1b9e77"
) +
geom_hline(
yintercept = cluster_sizes_at_200,
colour = "#d95f02"
) +
xlab("Mean membership probability score\n") +
ylab("Cluster size")
ggtitle("Cluster size and coherence") +
theme(plot.title = element_text(size=8))
## NULL
plot_largest_20_clusters <-
tweets_clusters %>%
right_join(
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
count(umap_hdbscan) %>%
ungroup() %>%
slice_max(order_by = n, n = 20)
) %>%
#count(umap_hdbscan)
ggplot(aes(
x = umap_comp1,
y = umap_comp2,
colour = as_factor(as.character(umap_hdbscan))
)) +
geom_point(size = 0.02) +
scale_colour_tableau(palette = "Tableau 20") +
guides(colour = guide_legend(
"Top 20 largest\nclusters (plot d)",
override.aes = list(size=5)
)) +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("Top 20 largest clusters")
## Joining, by = "umap_hdbscan"
plot_largest_200_clusters <-
tweets_clusters %>%
right_join(
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
count(umap_hdbscan) %>%
ungroup() %>%
slice_max(order_by = n, n = 200) %>%
slice_min(order_by = n, n = 180)
) %>%
#count(umap_hdbscan)
ggplot(aes(
x = umap_comp1,
y = umap_comp2,
colour = as_factor(as.character(umap_hdbscan))
)) +
geom_point(size = 0.02) +
#scale_colour_brewer(palette = "Set 1") +
#scale_colour_tableau(palette = "Tableau 20") +
guides(colour = "none") +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("Next 180 largest clusters")
## Joining, by = "umap_hdbscan"
plot_other_clusters <-
tweets_clusters %>%
anti_join(
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
count(umap_hdbscan) %>%
ungroup() %>%
slice_max(order_by = n, n = 200)
) %>%
filter(umap_hdbscan != -1) %>%
#count(umap_hdbscan)
ggplot(aes(
x = umap_comp1,
y = umap_comp2,
colour = as_factor(as.character(umap_hdbscan))
)) +
geom_point(size = 0.02) +
guides(colour = "none") +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
ggtitle("All other clusters")
## Joining, by = "umap_hdbscan"
plot_no_clusters <-
tweets_clusters %>%
filter(umap_hdbscan == -1) %>%
#count(umap_hdbscan)
ggplot(aes(
x = umap_comp1,
y = umap_comp2
)) +
geom_point(size = 0.02) +
xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
coord_fixed(ratio = 1) +
#coord_cartesian(clip="off") +
#geom_rangeframe() +
ggtitle("Unclustered tweets")
plots_all_clusters_layout <-
c(
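# patchwork::area(t, l, b, r): the panel spans grid rows t..b and columns l..r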
area(1, 1, 2, 2),
area(1, 3, 2, 4),
area(1, 5, 2, 6),
area(3, 1, 5, 3),
area(3, 4, 5, 6),
area(6, 1, 8, 3),
area(6, 4, 8, 6)
)
plots_all_clusters <-
(
# (plot_cluster_size_power + plot_cluster_size_elbow + plot_cluster_size_prob) /
# (plot_largest_20_clusters + plot_largest_200_clusters) /
# (plot_other_clusters + plot_no_clusters)
plot_cluster_size_power + plot_cluster_size_elbow + plot_cluster_size_prob +
plot_largest_20_clusters + plot_largest_200_clusters +
plot_other_clusters + plot_no_clusters
) +
plot_annotation(
tag_levels = c("a"),
title = 'HDBSCAN clustering results',
caption = 'The colours in plots e and f are randomised and repeated across different clusters, aiming for simple visual discernibility of the clusters rather than full identification.'
) +
plot_layout(
design = plots_all_clusters_layout,
guides = 'collect'
) &
theme_bw() #+
#theme_tufte() +
#theme(legend.position='left')
print(plots_all_clusters)
rm(plot_largest_clusters, plot_other_clusters, plot_no_clusters, plots_all_clusters)
# tweets_clusters %>%
# filter(umap_hdbscan != -1) %>%
# group_by(umap_hdbscan) %>%
# summarise(
# umap_hdbscan_prob = mean(umap_hdbscan_prob),
# umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score),
# n = n()
# ) %>%
# View()
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
ggplot(aes(
x = umap_hdbscan_prob
)) +
geom_histogram(bins = 30)
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
ggplot(aes(
x = umap_hdbscan_outlier_score
)) +
geom_histogram(bins = 30)
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
group_by(umap_hdbscan) %>%
summarise(
umap_hdbscan_prob = mean(umap_hdbscan_prob),
umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score),
n = n()
) %>%
filter(n > 500) %>%
ggplot(aes(
x = umap_hdbscan_prob,
y = umap_hdbscan_outlier_score,
size = n
)) +
geom_point()
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
group_by(umap_hdbscan) %>%
summarise(
umap_hdbscan_prob = mean(umap_hdbscan_prob),
umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score),
n = n()
) %>%
ggplot(aes(
x = n,
y = umap_hdbscan_prob
)) +
geom_point(size = 0.1) +
scale_x_log10()
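To complement the visual impression that larger clusters tend to have lower mean membership probability, a rank correlation can be computed on the same per-cluster summary; a minimal sketch:
# Sketch: rank correlation between cluster size and mean membership probability
tweets_clusters %>%
filter(umap_hdbscan != -1) %>%
group_by(umap_hdbscan) %>%
summarise(
umap_hdbscan_prob = mean(umap_hdbscan_prob),
n = n()
) %>%
summarise(spearman_rho = cor(n, umap_hdbscan_prob, method = "spearman"))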
Simple word counts along with tf-idf were used to assess the words most likely to characterise each cluster. The tweet_text_clr_emo version of the text was used, which is the same version adopted for topic modelling.
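As a reminder of what bind_tf_idf computes, a minimal toy example on made-up documents (the words and counts below are purely illustrative):
# Toy example: tf-idf on two made-up "documents" (illustrative data only)
tibble(
doc = c("d1", "d1", "d2", "d2"),
word = c("foxes", "city", "city", "match"),
n = c(10, 5, 5, 10)
) %>%
bind_tf_idf(word, doc, n)
# "city" appears in both documents, so its idf (and hence tf_idf) is zero;
# "foxes" and "match" are document-specific and score highest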
tweets_corpus <-
read_csv("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_with-sentimentr.csv") %>%
mutate(tweet_id_str = str_remove(tweet_url, ".*status\\/"))
## Rows: 849988 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): user_url, user_name, user_screen_name, tweet_url, tweet_media_url...
## dbl (4): tweet_id_str, tweet_geo_long, tweet_geo_lat, tweet_sentimentr_sen...
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check IDs
(tweets_corpus %>% nrow()) == (tweets_corpus %>% pull(tweet_id_str) %>% unique() %>% length())
## [1] TRUE
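An equivalent, slightly more idiomatic check uses dplyr's n_distinct(); a minimal sketch:
# Sketch: same uniqueness check via n_distinct()
tweets_corpus %>%
summarise(ids_unique = n_distinct(tweet_id_str) == n())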
tweets_clusters_umap_hdbscan_with_text <-
tweets_corpus %>%
select(tweet_url, tweet_text_clr_emo) %>%
right_join(
tweets_clusters %>%
select(tweet_url, umap_hdbscan) %>%
filter(umap_hdbscan != -1)
)
## Joining, by = "tweet_url"
tweets_clusters_umap_hdbscan_with_words <-
tweets_clusters_umap_hdbscan_with_text %>%
select(umap_hdbscan, tweet_text_clr_emo) %>%
unnest_tokens(word, tweet_text_clr_emo) %>%
anti_join(stop_words) %>%
filter(!(
word %in% c("email", "website", "username", "date", "time", "money", "amp", "leicester")
)) %>%
count(umap_hdbscan, word, sort = TRUE)
## Joining, by = "word"
tweets_clusters_umap_hdbscan_total_words <-
tweets_clusters_umap_hdbscan_with_words %>%
group_by(umap_hdbscan) %>%
summarize(total = sum(n))
tweets_clusters_umap_hdbscan_with_words <-
tweets_clusters_umap_hdbscan_with_words %>%
left_join(tweets_clusters_umap_hdbscan_total_words)
## Joining, by = "umap_hdbscan"
tweets_clusters_umap_hdbscan_tf_idf <-
tweets_clusters_umap_hdbscan_with_words %>%
bind_tf_idf(word, umap_hdbscan, n)
tweets_clusters_umap_hdbscan_tf_idf %>%
group_by(umap_hdbscan) %>%
slice_max(order_by = tf_idf, n = 10) %>%
ungroup() %>%
select(umap_hdbscan, word) %>%
group_by(umap_hdbscan) %>%
summarise(words = paste0(word, collapse = ", ")) %>%
left_join(
tweets_clusters %>%
group_by(umap_hdbscan) %>%
summarise(
mean_umap_hdbscan_prob = mean(umap_hdbscan_prob),
sd_umap_hdbscan_prob = sd(umap_hdbscan_prob),
mean_umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score),
n_of_tweets = n()
)
) %>%
arrange(-n_of_tweets)
## Joining, by = "umap_hdbscan"
## # A tibble: 1,739 × 6
## umap_hdbscan words mean_umap_hdbsc… sd_umap_hdbscan… mean_umap_hdbsc…
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1016 laughing, pe… 0.999 0.00348 0.0123
## 2 906 question, fu… 0.999 0.00585 0.0207
## 3 543 lcfc, league… 0.994 0.0230 0.0236
## 4 765 sleep, tired… 0.997 0.0123 0.0208
## 5 458 laughing, lo… 0.975 0.0633 0.0630
## 6 362 bro, mate, c… 0.982 0.0582 0.107
## 7 504 agree, peopl… 0.958 0.0833 0.102
## 8 346 laughing, lo… 0.965 0.116 0.0963
## 9 802 liverpool, l… 0.996 0.0127 0.0210
## 10 984 hope, xx, co… 0.996 0.0109 0.0208
## # … with 1,729 more rows, and 1 more variable: n_of_tweets <int>
#%>%
#kable()
tweets_clusters_tfidf <-
tweets_clusters %>%
select(tweet_id_str:umap_comp2, umap_hdbscan:umap_hdbscan_outlier_score) %>%
left_join(
tweets_clusters_umap_hdbscan_tf_idf %>%
group_by(umap_hdbscan) %>%
slice_max(order_by = tf_idf, n = 10) %>%
ungroup() %>%
select(umap_hdbscan, word) %>%
group_by(umap_hdbscan) %>%
summarise(umap_hdbscan_tfidf10 = paste0(word, collapse = " + "))
)
## Joining, by = "umap_hdbscan"
tweets_clusters_tfidf %>%
write_csv("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_clr-noemo_embeddings-twitter-roberta-base_umap-2d_dbscans_tfidf.csv")