library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggthemes)
library(patchwork)
library(tidytext)
library(knitr)
tweets_clusters <- 
  read_csv(
    "../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_clr-noemo_embeddings-twitter-roberta-base_umap-2d_dbscans.csv",
    col_types = list(
      tweet_id_str = col_character(),
      tweet_url = col_character(),
      umap_comp1 = col_double(),
      umap_comp2 = col_double(),
      umap_dbscan_eps005 = col_double(),
      umap_dbscan_eps006 = col_double(),
      umap_dbscan_eps007 = col_double(),
      umap_dbscan_eps008 = col_double(),
      umap_dbscan_eps009 = col_double(),
      umap_dbscan_eps010 = col_double(),
      umap_dbscan_eps011 = col_double(),
      umap_dbscan_eps012 = col_double(),
      umap_dbscan_eps013 = col_double(),
      umap_dbscan_eps014 = col_double(),
      umap_dbscan_eps015 = col_double(),
      umap_dbscan_eps016 = col_double(),
      umap_dbscan_eps017 = col_double(),
      umap_dbscan_eps018 = col_double(),
      umap_dbscan_eps019 = col_double(),
      umap_dbscan_eps020 = col_double(),
      umap_dbscan_eps021 = col_double(),
      umap_dbscan_eps022 = col_double(),
      umap_dbscan_eps023 = col_double(),
      umap_dbscan_eps024 = col_double(),
      umap_dbscan_eps025 = col_double(),
      umap_hdbscan = col_double(),
      umap_hdbscan_prob = col_double(),
      umap_hdbscan_outlier_score = col_double()
    )
  )
for (i_eps in c(5, 10, 15, 20, 25)) {
  
  eps_var <- 
    i_eps %>% 
    str_pad(width = 3, pad = "0") %>% 
    paste0("umap_dbscan_eps", .)

  plot_largest_clusters <-
    tweets_clusters %>%
    right_join(
      tweets_clusters %>% 
        filter(.data[[ eps_var ]] != -1) %>% 
        count(.data[[ eps_var ]]) %>% 
        ungroup() %>% 
        slice_max(order_by = n, n = 20)
    ) %>%
    #count(.data[[ eps_var ]])
    ggplot(aes(
      x = umap_comp1,
      y = umap_comp2,
      colour = as_factor(.data[[ eps_var ]])
    )) +
    geom_point(size = 0.02) +
    scale_colour_tableau(palette = "Tableau 20") +
    guides(colour = "none") +
    xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
    coord_fixed(ratio = 1) +
    ggtitle("20 largest clusters")
  
  plot_other_clusters <-
    tweets_clusters %>%
    anti_join(
      tweets_clusters %>% 
        filter(.data[[ eps_var ]] != -1) %>% 
        count(.data[[ eps_var ]]) %>% 
        ungroup() %>% 
        slice_max(order_by = n, n = 20)
    ) %>%
    filter(.data[[ eps_var ]] != -1) %>% 
    #count(.data[[ eps_var ]])
    ggplot(aes(
      x = umap_comp1,
      y = umap_comp2,
      colour = as_factor(as.character(.data[[ eps_var ]]))
    )) +
    geom_point(size = 0.02) +
    guides(colour = "none") +
    xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
    coord_fixed(ratio = 1) +
    ggtitle("All other clusters (same colour used multiple times for different clusters)")
  
  plot_no_clusters <-
    tweets_clusters %>%
    filter(.data[[ eps_var ]] == -1) %>% 
    #count(.data[[ eps_var ]])
    ggplot(aes(
      x = umap_comp1,
      y = umap_comp2
    )) +
    geom_point(size = 0.02) +
    xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
    coord_fixed(ratio = 1) +
    ggtitle("Unclustered data")
  
  plots_all_clusters <-
    plot_largest_clusters + plot_other_clusters + plot_no_clusters + 
    plot_annotation(
      title = sprintf('Clustering using eps=%.2f', i_eps / 100)
    )
  
  print(plots_all_clusters)

}
## Joining, by = "umap_dbscan_eps005"
## Joining, by = "umap_dbscan_eps005"
## Joining, by = "umap_dbscan_eps010"
## Joining, by = "umap_dbscan_eps010"

## Joining, by = "umap_dbscan_eps015"
## Joining, by = "umap_dbscan_eps015"

## Joining, by = "umap_dbscan_eps020"
## Joining, by = "umap_dbscan_eps020"

## Joining, by = "umap_dbscan_eps025"
## Joining, by = "umap_dbscan_eps025"

cluster_sizes_to_plot <-
  tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  count(umap_hdbscan, sort = TRUE) %>% 
  ungroup() %>% 
  mutate(rowname = row_number())

cluster_sizes_at_20 <-
  cluster_sizes_to_plot %>% 
  filter(rowname == 20) %>% 
  pull(n)

cluster_sizes_at_200 <-
  cluster_sizes_to_plot %>% 
  filter(rowname == 200) %>% 
  pull(n)

plot_cluster_size_power <-
  cluster_sizes_to_plot %>% 
  ggplot(aes(x = rowname, y = n)) + 
  geom_point(size = 0.1) + 
  xlab("Cluster order\n") +
  ylab("Cluster size") +
  scale_x_log10() + 
  scale_y_log10() +
  geom_line(
    data = data.frame(
      x = c(20, 20, 1000000),
      y = c(0.001, cluster_sizes_at_20, cluster_sizes_at_20)
    ),
    aes(x = x, y = y),
    colour = "#1b9e77"
  ) +
  geom_line(
    data = data.frame(
      x = c(200, 200, 1000000),
      y = c(0.001, cluster_sizes_at_200, cluster_sizes_at_200)
    ),
    aes(x = x, y = y),
    colour = "#d95f02"
  ) +
  coord_cartesian(
    xlim = c(
      0.999, 
      cluster_sizes_to_plot %>% nrow() %>% `*`(1.001)
    ), 
    ylim = c(
      cluster_sizes_to_plot %>% pull(n) %>% min() %>% `*`(0.999),
      cluster_sizes_to_plot %>% pull(n) %>% max() %>% `*`(1.001)
    )
  ) +
  ggtitle("Log-log plot") + 
  theme(plot.title = element_text(size=8))

plot_cluster_size_elbow <-
  cluster_sizes_to_plot %>% 
  ggplot(aes(x = rowname, y = n)) + 
  geom_point(size = 0.1) + 
  xlab("Cluster order\n") +
  ylab("Cluster size") +
  scale_y_log10() +
  geom_line(
    data = data.frame(
      x = c(20, 20, 1000000),
      y = c(0.001, cluster_sizes_at_20, cluster_sizes_at_20)
    ),
    aes(x = x, y = y),
    colour = "#1b9e77"
  ) +
  geom_line(
    data = data.frame(
      x = c(200, 200, 1000000),
      y = c(0.001, cluster_sizes_at_200, cluster_sizes_at_200)
    ),
    aes(x = x, y = y),
    colour = "#d95f02"
  ) +
  coord_cartesian(
    xlim = c(
      0.999, 
      cluster_sizes_to_plot %>% nrow() %>% `*`(1.001)
    ), 
    ylim = c(
      cluster_sizes_to_plot %>% pull(n) %>% min() %>% `*`(0.999),
      cluster_sizes_to_plot %>% pull(n) %>% max() %>% `*`(1.001)
    )
  ) +
  ggtitle("Log-linear plot") + 
  theme(plot.title = element_text(size=8))

plot_cluster_size_prob <-
  tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  group_by(umap_hdbscan) %>% 
  summarise(
    mean_umap_hdbscan_prob = mean(umap_hdbscan_prob), 
    mean_umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score), 
    n = n()
  ) %>% 
  ungroup() %>% 
  ggplot(aes(x = mean_umap_hdbscan_prob, y = n)) + 
  geom_point(size = 0.1) + 
  scale_y_log10() +
  geom_hline(
    yintercept = cluster_sizes_at_20, 
    colour = "#1b9e77"
  ) +
  geom_hline(
    yintercept = cluster_sizes_at_200, 
    colour = "#d95f02"
  ) +
  xlab("Mean membership probability score\n") +
  ylab("Cluster size") 
  ggtitle("Cluster size and coherence") + 
  theme(plot.title = element_text(size=8))
## NULL
plot_largest_20_clusters <-
  tweets_clusters %>%
  right_join(
    tweets_clusters %>% 
      filter(umap_hdbscan != -1) %>% 
      count(umap_hdbscan) %>% 
      ungroup() %>% 
      slice_max(order_by = n, n = 20)
  ) %>%
  #count(umap_hdbscan)
  ggplot(aes(
    x = umap_comp1,
    y = umap_comp2,
    colour = as_factor(as.character(umap_hdbscan))
  )) +
  geom_point(size = 0.02) +
  scale_colour_tableau(palette = "Tableau 20") +
  guides(colour = guide_legend(
    "Top 20 largest\nclusters (plot d)",
    override.aes = list(size=5)
  )) +
  xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  coord_fixed(ratio = 1) +
  ggtitle("Top 20 largest clusters")
## Joining, by = "umap_hdbscan"
plot_largest_200_clusters <-
  tweets_clusters %>%
  right_join(
    tweets_clusters %>% 
      filter(umap_hdbscan != -1) %>% 
      count(umap_hdbscan) %>% 
      ungroup() %>% 
      slice_max(order_by = n, n = 200) %>% 
      slice_min(order_by = n, n = 180)
  ) %>%
  #count(umap_hdbscan)
  ggplot(aes(
    x = umap_comp1,
    y = umap_comp2,
    colour = as_factor(as.character(umap_hdbscan))
  )) +
  geom_point(size = 0.02) +
  #scale_colour_brewer(palette = "Set 1") +
  #scale_colour_tableau(palette = "Tableau 20") +
  guides(colour = "none") +
  xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  coord_fixed(ratio = 1) +
  ggtitle("Next 180 largest clusters")
## Joining, by = "umap_hdbscan"
plot_other_clusters <-
  tweets_clusters %>%
  anti_join(
    tweets_clusters %>% 
      filter(umap_hdbscan != -1) %>% 
      count(umap_hdbscan) %>% 
      ungroup() %>% 
      slice_max(order_by = n, n = 200)
  ) %>%
  filter(umap_hdbscan != -1) %>% 
  #count(umap_hdbscan)
  ggplot(aes(
    x = umap_comp1,
    y = umap_comp2,
    colour = as_factor(as.character(umap_hdbscan))
  )) +
  geom_point(size = 0.02) +
  guides(colour = "none") +
  xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  coord_fixed(ratio = 1) +
  ggtitle("All other clusters")
## Joining, by = "umap_hdbscan"
plot_no_clusters <-
  tweets_clusters %>%
  filter(umap_hdbscan == -1) %>% 
  #count(umap_hdbscan)
  ggplot(aes(
    x = umap_comp1,
    y = umap_comp2
  )) +
  geom_point(size = 0.02) +
  xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  coord_fixed(ratio = 1) +
  #coord_cartesian(clip="off") +
  #geom_rangeframe() +
  ggtitle("Unclustered tweets")

plots_all_clusters_layout <-
  c(
    area(1, 1, 2, 2),
    area(1, 3, 2, 4),
    area(1, 5, 2, 6),
    area(3, 1, 5, 3),
    area(3, 4, 5, 6),
    area(6, 1, 8, 3),
    area(6, 4, 8, 6)
  )

plots_all_clusters <-
  (
    # (plot_cluster_size_power + plot_cluster_size_elbow + plot_cluster_size_prob) / 
    # (plot_largest_20_clusters + plot_largest_200_clusters) / 
    # (plot_other_clusters + plot_no_clusters)
    plot_cluster_size_power + plot_cluster_size_elbow + plot_cluster_size_prob +
    plot_largest_20_clusters + plot_largest_200_clusters +
    plot_other_clusters + plot_no_clusters
  ) +
  plot_annotation(
    tag_levels = c("a"),
    title = paste0('HDBSCAN clustering results'),
    caption = 'The colours used in plots e and f are randomised and repeated multiple times for different clusters, aiming for simple visual discernibility of the different clusters rather than full identification.'
  ) + 
  plot_layout(
    design = plots_all_clusters_layout,
    guides = 'collect'
  ) & 
  theme_bw() #+
  #theme_tufte() +
  #theme(legend.position='left')

print(plots_all_clusters)

rm(plot_largest_clusters, plot_other_clusters, plot_no_clusters, plots_all_clusters)
# tweets_clusters %>% 
#   filter(umap_hdbscan != -1) %>% 
#   group_by(umap_hdbscan) %>% 
#   summarise(
#     umap_hdbscan_prob = mean(umap_hdbscan_prob), 
#     umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score), 
#     n = n()
#   ) %>% 
#   View()

tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  ggplot(aes(
    x = umap_hdbscan_prob
  )) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  ggplot(aes(
    x = umap_hdbscan_outlier_score
  )) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  group_by(umap_hdbscan) %>% 
  summarise(
    umap_hdbscan_prob = mean(umap_hdbscan_prob), 
    umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score), 
    n = n()
  ) %>% 
  filter(n > 500) %>% 
  ggplot(aes(
    x = umap_hdbscan_prob,
    y = umap_hdbscan_outlier_score,
    size = n
  )) +
  geom_point()

tweets_clusters %>% 
  filter(umap_hdbscan != -1) %>% 
  group_by(umap_hdbscan) %>% 
  summarise(
    umap_hdbscan_prob = mean(umap_hdbscan_prob), 
    umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score), 
    n = n()
  ) %>% 
  ggplot(aes(
    x = n,
    y = umap_hdbscan_prob
  )) +
  geom_point(size = 0.1) +
  scale_x_log10()

Word frequency analysis

Simple word counts along with tf-idf were used to assess the words most likely to characterise each cluster, treating each cluster as a document. The tweet_text_clr_emo version of the text was used, which is the same version adopted for topic modelling.
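
For reference, tf-idf here multiplies a word's frequency within a cluster by the log of how rare the word is across clusters. Below is a minimal dplyr sketch of that computation, assuming a tibble of per-cluster word counts like the one built further down (sketch_tf_idf is a hypothetical helper name used only for illustration; the actual results rely on tidytext::bind_tf_idf()):

sketch_tf_idf <- function(word_counts) {
  # Number of "documents", i.e. clusters
  n_clusters <- n_distinct(word_counts$umap_hdbscan)
  word_counts %>%
    group_by(umap_hdbscan) %>%
    # Term frequency: the word's share of all word occurrences in the cluster
    mutate(tf = n / sum(n)) %>%
    group_by(word) %>%
    # Inverse document frequency: log of total clusters over clusters containing the word
    mutate(idf = log(n_clusters / n_distinct(umap_hdbscan))) %>%
    ungroup() %>%
    mutate(tf_idf = tf * idf)
}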

tweets_corpus <- 
  read_csv("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_with-sentimentr.csv") %>%
  # Re-derive the tweet ID from the tweet URL: the CSV's tweet_id_str column is
  # parsed as a double (see the column specification below), which can lose
  # precision on 64-bit tweet IDs.
  mutate(tweet_id_str = str_remove(tweet_url, ".*status\\/"))
## Rows: 849988 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): user_url, user_name, user_screen_name, tweet_url, tweet_media_url...
## dbl   (4): tweet_id_str, tweet_geo_long, tweet_geo_lat, tweet_sentimentr_sen...
## dttm  (1): timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check that the re-derived IDs are unique (one row per tweet)
(tweets_corpus %>% nrow()) == (tweets_corpus %>% pull(tweet_id_str) %>% unique() %>% length())
## [1] TRUE
tweets_clusters_umap_hdbscan_with_text <- 
  tweets_corpus %>%
  select(tweet_url, tweet_text_clr_emo) %>% 
  right_join(
    tweets_clusters %>%
      select(tweet_url, umap_hdbscan) %>% 
      filter(umap_hdbscan != -1)
  )
## Joining, by = "tweet_url"
tweets_clusters_umap_hdbscan_with_words <-
  tweets_clusters_umap_hdbscan_with_text %>% 
  select(umap_hdbscan, tweet_text_clr_emo) %>% 
  unnest_tokens(word, tweet_text_clr_emo) %>%
  anti_join(stop_words) %>%
  filter(!(
    word %in% c("email", "website", "username", "date", "time", "money", "amp", "leicester")
  )) %>% 
  count(umap_hdbscan, word, sort = TRUE)
## Joining, by = "word"
tweets_clusters_umap_hdbscan_total_words <- 
  tweets_clusters_umap_hdbscan_with_words %>% 
  group_by(umap_hdbscan) %>% 
  summarize(total = sum(n))

tweets_clusters_umap_hdbscan_with_words <- 
  tweets_clusters_umap_hdbscan_with_words %>% 
  left_join(tweets_clusters_umap_hdbscan_total_words)
## Joining, by = "umap_hdbscan"
tweets_clusters_umap_hdbscan_tf_idf <- 
  tweets_clusters_umap_hdbscan_with_words %>%
  bind_tf_idf(word, umap_hdbscan, n)

Exploring clusters

For each cluster, the ten words with the highest tf-idf scores are joined with the cluster's size and its mean membership probability and outlier scores, giving a first indication of what each cluster is about.

tweets_clusters_umap_hdbscan_tf_idf %>% 
  group_by(umap_hdbscan) %>% 
  slice_max(order_by = tf_idf, n = 10) %>% 
  ungroup() %>% 
  select(umap_hdbscan, word) %>% 
  group_by(umap_hdbscan) %>% 
  summarise(words = paste0(word, collapse = ", ")) %>% 
  left_join(
    tweets_clusters %>%
      group_by(umap_hdbscan) %>% 
      summarise(
        mean_umap_hdbscan_prob = mean(umap_hdbscan_prob), 
        sd_umap_hdbscan_prob = sd(umap_hdbscan_prob), 
        mean_umap_hdbscan_outlier_score = mean(umap_hdbscan_outlier_score), 
        n_of_tweets = n()
      )
  ) %>%
  arrange(-n_of_tweets) 
## Joining, by = "umap_hdbscan"
## # A tibble: 1,739 × 6
##    umap_hdbscan words         mean_umap_hdbsc… sd_umap_hdbscan… mean_umap_hdbsc…
##           <dbl> <chr>                    <dbl>            <dbl>            <dbl>
##  1         1016 laughing, pe…            0.999          0.00348           0.0123
##  2          906 question, fu…            0.999          0.00585           0.0207
##  3          543 lcfc, league…            0.994          0.0230            0.0236
##  4          765 sleep, tired…            0.997          0.0123            0.0208
##  5          458 laughing, lo…            0.975          0.0633            0.0630
##  6          362 bro, mate, c…            0.982          0.0582            0.107 
##  7          504 agree, peopl…            0.958          0.0833            0.102 
##  8          346 laughing, lo…            0.965          0.116             0.0963
##  9          802 liverpool, l…            0.996          0.0127            0.0210
## 10          984 hope, xx, co…            0.996          0.0109            0.0208
## # … with 1,729 more rows, and 1 more variable: n_of_tweets <int>
  #%>% 
  #kable()
tweets_clusters_tfidf <-
  tweets_clusters %>% 
  select(tweet_id_str:umap_comp2, umap_hdbscan:umap_hdbscan_outlier_score) %>% 
  left_join(
    tweets_clusters_umap_hdbscan_tf_idf %>% 
    group_by(umap_hdbscan) %>% 
    slice_max(order_by = tf_idf, n = 10) %>% 
    ungroup() %>% 
    select(umap_hdbscan, word) %>% 
    group_by(umap_hdbscan) %>% 
    summarise(umap_hdbscan_tfidf10 = paste0(word, collapse = " + "))
  ) 
## Joining, by = "umap_hdbscan"
tweets_clusters_tfidf %>%
  write_csv("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_clr-noemo_embeddings-twitter-roberta-base_umap-2d_dbscans_tfidf.csv")