tweet_classifications <-
  readRDS("../storage/MMc_TwitterLeicester2018-2019_all-en_clr-emo-lemm-multi_no-spam_with-all-tweet-classif-v0-2-0.rds")

Checking confidence distributions

Sentiment analysis

tweet_classifications %>% 
  ggplot(aes(x = tweet_sentimentr_sentiment)) +
  geom_histogram(binwidth = 0.1) +
  ylim(-100, 350000) +
  theme_bw()

tweet_classifications %>% 
  filter(tweet_flair_sentiment_value == "POSITIVE") %>% 
  ggplot(aes(x = tweet_flair_sentiment_confidence)) +
  geom_histogram(binwidth = 0.1) +
  xlim(-0.1,1.1) +
  ylim(-100, 350000) +
  theme_bw()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

tweet_classifications %>% 
  filter(tweet_flair_sentiment_value == "NEGATIVE") %>% 
  ggplot(aes(x = tweet_flair_sentiment_confidence)) +
  geom_histogram(binwidth = 0.1) +
  xlim(-0.1,1.1) +
  ylim(-100, 350000) +
  theme_bw()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Based on the distributions illustrated above, the values produced by sentimentr are re-classified as negative if the sentiment value is -0.1 or lower, positive if 0.1 or higher, and neutral otherwise. Similarly, the values produced by flair are re-classified as neutral if the confidence value is lower than 0.95, and according to the predicted sentiment value otherwise.

tweet_classifications <-
  tweet_classifications %>% 
  mutate(
    tweet_sentimentr_class = ordered(
      case_when(
        tweet_sentimentr_sentiment <= -0.1 ~ "Negative",
        tweet_sentimentr_sentiment >= 0.1 ~ "Positive",
        TRUE ~ "Neutral"
      ),
      levels = c("Negative", "Neutral", "Positive")
    ),
    tweet_flair_sentiment_class = ordered(
      case_when(
        tweet_flair_sentiment_confidence < 0.95 ~ "Neutral",
        tweet_flair_sentiment_value == "NEGATIVE" ~ "Negative",
        tweet_flair_sentiment_value == "POSITIVE" ~ "Positive"
      ),
      levels = c("Negative", "Neutral", "Positive")
    )
  )
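
As a quick sanity check, the distribution of the new classes can, for example, be inspected with a simple count.

# Quick check of the re-classified sentiment columns created above
tweet_classifications %>% 
  count(tweet_sentimentr_class)

tweet_classifications %>% 
  count(tweet_flair_sentiment_class)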

Emotion and context

tweet_classifications %>% 
  select(tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_top_emotion) %>% 
  pivot_longer(
    cols = tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_sadness,
    names_to = "emotion",
    values_to = "tweet_flair_e6c11m2_top_emotion_confidence"
  ) %>% 
  mutate(
    emotion = str_remove(emotion, "tweet_flair_e6c11m2_")
  ) %>% 
  filter(
    emotion == tweet_flair_e6c11m2_top_emotion
  ) %>% 
  ggplot(aes(x = tweet_flair_e6c11m2_top_emotion_confidence)) +
  geom_histogram(binwidth = 0.01)

tweet_classifications %>% 
  select(tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_top_context) %>% 
  pivot_longer(
    cols = tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_place_character,
    names_to = "context",
    values_to = "tweet_flair_c6c12m1_top_context_confidence"
  ) %>% 
  mutate(
    context = str_remove(context, "tweet_flair_c6c12m1_")
  ) %>% 
  filter(
    context == (tweet_flair_c6c12m1_top_context %>% 
      str_replace_all(" and ", "_") %>% 
      str_replace_all(", ", "_") %>% 
      str_replace_all(" ", "_"))
  ) %>% 
  ggplot(aes(x = tweet_flair_c6c12m1_top_context_confidence)) +
  geom_histogram(binwidth = 0.01)

Based on the distributions illustrated above, the classes predicted by the flair text classifier models are re-classified as uncertain if the confidence value is lower than 0.95, and kept as predicted by the model otherwise.

tweet_classifications <-
  tweet_classifications %>% 
  left_join(
    tweet_classifications %>% 
      select(tweet_id_str, tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_top_emotion) %>% 
      pivot_longer(
        cols = tweet_flair_e6c11m2_admiration:tweet_flair_e6c11m2_sadness,
        names_to = "emotion",
        values_to = "tweet_flair_e6c11m2_top_emotion_confidence"
      ) %>% 
      mutate(
        emotion = str_remove(emotion, "tweet_flair_e6c11m2_")
      ) %>% 
      filter(
        emotion == tweet_flair_e6c11m2_top_emotion
      ) %>%
      select(-emotion)
  ) %>% 
  relocate(
    tweet_flair_e6c11m2_top_emotion_confidence,
    .after = tweet_flair_e6c11m2_top_emotion
  ) %>% 
  left_join(
    tweet_classifications %>%
      select(tweet_id_str, tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_top_context) %>%
      pivot_longer(
        cols = tweet_flair_c6c12m1_commercial:tweet_flair_c6c12m1_place_character,
        names_to = "context",
        values_to = "tweet_flair_c6c12m1_top_context_confidence"
      ) %>%
      mutate(
        context = str_remove(context, "tweet_flair_c6c12m1_")
      ) %>%
      filter(
        context == (tweet_flair_c6c12m1_top_context %>% 
          str_replace_all(" and ", "_") %>% 
          str_replace_all(", ", "_") %>% 
          str_replace_all(" ", "_"))
      ) %>%
      select(-context)
  ) %>%
  relocate(
    tweet_flair_c6c12m1_top_context_confidence,
    .after = tweet_flair_c6c12m1_top_context
  ) %>%
  mutate(
    tweet_flair_e6c11m2_top_emotion = if_else(
      tweet_flair_e6c11m2_top_emotion_confidence < 0.95,
      "uncertain", tweet_flair_e6c11m2_top_emotion
    ),
    tweet_flair_c6c12m1_top_context = if_else(
      tweet_flair_c6c12m1_top_context_confidence < 0.95,
      "uncertain", tweet_flair_c6c12m1_top_context
    )
  )
## Joining, by = c("tweet_id_str", "tweet_flair_e6c11m2_top_emotion")
## Joining, by = c("tweet_id_str", "tweet_flair_c6c12m1_top_context")

Double-check the results.

tweet_classifications %>% 
  count(tweet_flair_e6c11m2_top_emotion) %>% 
  kable()
|tweet_flair_e6c11m2_top_emotion |      n|
|:-------------------------------|------:|
|admiration                      |  73547|
|annoyance                       |  38992|
|anticipation                    |  40162|
|excitement                      |  37889|
|gratitude                       |  10959|
|interest                        |  39486|
|joy                             | 579108|
|sadness                         |  29222|
|uncertain                       |    623|

tweet_classifications %>% 
  count(tweet_flair_c6c12m1_top_context) %>% 
  kable()
|tweet_flair_c6c12m1_top_context      |      n|
|:------------------------------------|------:|
|commercial                           |  55849|
|community events                     |  29706|
|connecting and sharing               | 194357|
|family, friendship and relationships |  19574|
|health, fitness and wellbeing        |  28137|
|leisure, hobbies and interests       | 493237|
|local environment                    |   7490|
|place character                      |   9964|
|uncertain                            |  11674|
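
As a complementary check, the share of tweets re-classified as uncertain by the 0.95 confidence threshold can, for example, be computed directly from the columns created above.

# Percentage of tweets re-classified as "uncertain" by the 0.95 confidence threshold
tweet_classifications %>% 
  summarise(
    uncertain_emotion_pct = mean(tweet_flair_e6c11m2_top_emotion == "uncertain", na.rm = TRUE) * 100,
    uncertain_context_pct = mean(tweet_flair_c6c12m1_top_context == "uncertain", na.rm = TRUE) * 100
  )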

Matching plots

In this section, we look at the relationship between the different classifications, exploring the number of tweets that fall at the intersection of classes from different classifications.

Functions

The find_match function generates a cross-tabulation of two classifications and identifies as matching the pairs of classes that share the highest number of elements at their intersection.

find_match <- function(data, a, b){
  # Create crosstabulation
  # as a long data frame
  crosstab_df <- table(
    data %>% pull(.data[[a]]),
    data %>% pull(.data[[b]]),
    dnn = c(a, b)
    ) %>% 
    as.data.frame(stringsAsFactors = FALSE)
  matching <- tibble(class_a = c(NA), class_b = c(NA))
  # Find best match
  while( nrow(crosstab_df) > 0 ){
    # Select the highest value for match for class A
    this_class_a <- 
      crosstab_df %>% 
      slice_max(order_by = Freq, n = 1, with_ties = FALSE) %>% 
      pull(.data[[a]])
    # Select the respective highest value among intersecting class B
    this_class_b <-
      crosstab_df %>% 
      filter(.data[[a]] == this_class_a) %>% 
      slice_max(order_by = Freq, n = 1, with_ties = FALSE) %>% 
      pull(.data[[b]])
    # Add them as a match
    crosstab_df <-
      crosstab_df %>% 
      filter(
        .data[[a]] != this_class_a &
        .data[[b]] != this_class_b
      )
    #cat(this_class_a, this_class_b, "\n")
    matching <-
      matching %>% 
      add_row(
        class_a = this_class_a,
        class_b = this_class_b
      )
  }
  # Add NAs for classes without matches
  matching <-
    matching %>% 
    filter(!is.na(class_a))
  missing_a <- 
    data %>% 
    filter(
      !(.data[[a]] %in% (matching %>% pull(class_a))) &
      !(is.na(.data[[a]]))
    ) %>% 
    pull(.data[[a]]) %>% 
    unique()
  if (length(missing_a) > 0) {
    for (a_missing_a in missing_a) {
      matching <-
        matching %>% 
        add_row(
          class_a = a_missing_a %>% as.character(),
          class_b = NA
        )
    }
  }
  missing_b <- 
    data %>% 
    filter(
      !(.data[[b]] %in% (matching %>% pull(class_b))) &
      !(is.na(.data[[b]]))
    ) %>% 
    pull(.data[[b]]) %>% 
    unique()
  if (length(missing_b) > 0) {
    for (a_missing_b in missing_b) {
      matching <-
        matching %>% 
        add_row(
          class_a = NA,
          class_b = a_missing_b %>% as.character()
        )
    }
  }
  colnames(matching) <- c(a, b)
  matching
}
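
For illustration, the greedy matching can be traced on a small hypothetical example; the toy tibble below is not part of the analysis data.

# Hypothetical toy data: "a" mostly co-occurs with "p", "b" with "q", "c" with "r"
toy_classifications <- tibble(
  class_x = c("a", "a", "a", "b", "b", "c"),
  class_y = c("p", "p", "q", "q", "q", "r")
)
# Expected matching: a-p, b-q, c-r
find_match(toy_classifications, "class_x", "class_y")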

The match_and_plot function creates a heatmap illustrating the number of cases at the intersection of each pair of classes from two classifications. It uses geom_tile to generate the visualisation and find_match to sort the classes, so that if the two classifications are correlated a clear diagonal of high values should appear.

match_and_plot <- function(data, a, a_title, b, b_title){
  
  this_mapping <- find_match(data, a, b)
  
  data_tile_plot_main <-
    data %>%
    count(.data[[a]], .data[[b]]) %>% 
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = n
      )
    ) +
    geom_tile() +
    #ggplot2::scale_fill_viridis_b(
    ggplot2::scale_fill_viridis_c(
      trans = "log",
      # breaks = c(1000, 3000, 10000, 30000, 100000, 300000),
      # labels = c("1,000", "3,000", "10,000", "30,000", "100,000", "300,000"),
      breaks = breaks_log(n = 6, base = 10),
      labels = label_comma(),
      name = "Count"
    ) +
    xlab(a_title) +
    ylab(b_title) +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    )
  
  data_tile_plot_A <-
    data %>%
    count(.data[[a]], .data[[b]]) %>% 
    group_by(.data[[a]]) %>% 
    mutate(perc = (n/sum(n))*100) %>% 
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = perc
      )
    ) +
    geom_tile() +
    #ggplot2::scale_fill_viridis_b(
    ggplot2::scale_fill_viridis_c(
      trans = "log",
      # breaks = c(1, 2.5, 5, 10, 25, 50),
      # labels = c("1", "2.5", "5", "10", "25", "50"),
      breaks = breaks_log(n = 6, base = 2),
      labels = label_comma(),
      name = paste("Percentage\nby", a_title)
    ) +
    #xlab("Predicted context") +
    xlab("") +
    #ylab("Predicted emotion") +
    ylab("") +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    )
  
  data_tile_plot_B <-
    data %>%
    count(.data[[a]], .data[[b]]) %>% 
    group_by(.data[[b]]) %>% 
    mutate(perc = (n/sum(n))*100) %>% 
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = perc
      )
    ) +
    geom_tile() +
    #ggplot2::scale_fill_viridis_b(
    ggplot2::scale_fill_viridis_c(
      trans = "log",
      # breaks = c(1, 2.5, 5, 10, 25, 50),
      # labels = c("1", "2.5", "5", "10", "25", "50"),
      breaks = breaks_log(n = 6, base = 2),
      labels = label_comma(),
      name = paste("Percentage\nby", b_title)
    ) +
    #xlab("Predicted context") +
    xlab("") +
    #ylab("Predicted emotion") +
    ylab("") +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    )
  
  data_tile_all <-
    (data_tile_plot_main + theme(plot.margin = unit(c(0,0,0,0), "pt"))) + 
    (
      (data_tile_plot_A + theme(plot.margin = unit(c(0,0,0,0), "pt"))) /
      (data_tile_plot_B + theme(plot.margin = unit(c(0,0,0,0), "pt")))
    ) 
  
  data_tile_all
  
  # ggsave(
  #   paste0(
  #     "../images/MMc-compare-classifications-tweets_",
  #     a, "_", b, ".pdf"
  #   ),
  #   width = 400, height = 200, units = "mm", 
  #   data_tile_all
  #   )
}

The match_and_plot_top function is similar to the match_and_plot function above, but only the top 20 classes of each classification are visualised: the 20 most frequent classes (with ties) of each classification, together with the classes of the other classification with which they most frequently intersect.

match_and_plot_top <- function(data, a, a_title, b, b_title){

  this_mapping <- find_match(data, a, b)
  
  top_perc <-
    bind_rows(
      data %>% 
        count(.data[[a]]) %>% 
        slice_max(order_by = n, n = 20) %>% 
        left_join(
          data %>%
            count(.data[[a]], .data[[b]]) %>% 
            filter(
              !is.na(.data[[a]]) &
              !is.na(.data[[b]])
            ) %>% 
            group_by(.data[[a]]) %>% 
            mutate(
              perc = (n/sum(n))*100
            ) %>% 
            slice_max(order_by = perc, n = 1) %>% 
            select(.data[[a]], .data[[b]])
        ),
      data %>% 
        count(.data[[b]]) %>% 
        slice_max(order_by = n, n = 20) %>% 
        left_join(
          data %>%
            count(.data[[a]], .data[[b]]) %>% 
            filter(
              !is.na(.data[[a]]) &
              !is.na(.data[[b]])
            ) %>% 
            group_by(.data[[b]]) %>% 
            mutate(
              perc = (n/sum(n))*100
            ) %>% 
            slice_max(order_by = perc, n = 1) %>% 
            select(.data[[a]], .data[[b]])
        )
    )

  data_tile_plot_main <-
    data %>%
    count(.data[[a]], .data[[b]]) %>%
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = n
      )
    ) +
    geom_tile() +
    #ggplot2::scale_fill_viridis_c(
    ggplot2::scale_fill_viridis_b(
      trans = "log",
      # breaks = c(10, 30, 100, 300, 1000, 3000),
      # labels = c("10", "30", "100", "300", "1,000", "3,000"),
      breaks = breaks_log(n = 6, base = 2),
      labels = label_comma(),
      name = "Count"
    ) +
    xlab(a_title) +
    ylab(b_title) +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    ) +
    xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
    ylim(top_perc %>% pull(.data[[b]]) %>% unique())
    # xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
    # ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
  
  data_tile_plot_A <-
    data %>%
    count(.data[[a]], .data[[b]]) %>%
    group_by(.data[[a]]) %>% 
    mutate(perc = (n/sum(n))*100) %>% 
    ungroup() %>% 
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = perc
      )
    ) +
    geom_tile() +
    # ggplot2::scale_fill_viridis_b(
    ggplot2::scale_fill_viridis_c(
      trans = "log",
      # breaks = c(1, 2.5, 5, 10, 25, 50),
      # labels = c("1", "2.5", "5", "10", "25", "50"),
      breaks = breaks_log(n = 6, base = 2),
      labels = label_comma(),
      name = paste("Percentage\nby", a_title)
    ) +
    #xlab("Predicted context") +
    xlab("") +
    #ylab("Predicted emotion") +
    ylab("") +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    ) +
    xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
    ylim(top_perc %>% pull(.data[[b]]) %>% unique())
    # xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
    # ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
  
  
  data_tile_plot_B <-
    data %>%
    count(.data[[a]], .data[[b]]) %>%
    group_by(.data[[b]]) %>% 
    mutate(perc = (n/sum(n))*100) %>% 
    ungroup() %>% 
    ggplot(
      aes(
        x = ordered(
          .data[[a]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[a]])) %>% pull(.data[[a]])),
        y = ordered(
          .data[[b]] %>% as.character(), 
          levels = this_mapping %>% filter(!is.na(.data[[b]])) %>% pull(.data[[b]]) %>% rev()),
        fill = perc
      )
    ) +
    geom_tile() +
    # ggplot2::scale_fill_viridis_b(
    ggplot2::scale_fill_viridis_c(
      trans = "log",
      # breaks = c(1, 2.5, 5, 10, 25, 50),
      # labels = c("1", "2.5", "5", "10", "25", "50"),
      breaks = breaks_log(n = 6, base = 2),
      labels = label_comma(),
      name = paste("Percentage\nby", b_title)
    ) +
    #xlab("Predicted context") +
    xlab("") +
    #ylab("Predicted emotion") +
    ylab("") +
    coord_fixed(ratio = 1) +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.text.x = 
        element_text(
          angle = 45, 
          vjust = 1, 
          hjust = 1
        )
    ) +
    xlim(top_perc %>% pull(.data[[a]]) %>% unique()) +
    ylim(top_perc %>% pull(.data[[b]]) %>% unique())
    # xlim(data %>% count(.data[[a]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[a]]) %>% unique()) +
    # ylim(data %>% count(.data[[b]]) %>% slice_max(order_by = n, n = 20) %>% pull(.data[[b]]) %>% unique())
  
  
  data_tile_all <-
    (data_tile_plot_main + theme(plot.margin = unit(c(0,0,0,0), "pt"))) + 
    (
      (data_tile_plot_A + theme(plot.margin = unit(c(0,0,0,0), "pt"))) /
      (data_tile_plot_B + theme(plot.margin = unit(c(0,0,0,0), "pt")))
    ) 
  
  data_tile_all
  
  # ggsave(
  #   paste0(
  #     "../images/MMc-compare-classifications-tweets_",
  #     a, "_", b, ".pdf"
  #   ),
  #   width = 400, height = 200, units = "mm", limitsize = FALSE,
  #   data_tile_all
  #   )
}

Sentiment analysis

tweet_classifications %>% 
  count(tweet_sentimentr_class, tweet_flair_sentiment_class) %>%
  ggplot(
    aes(
      x = ordered(
        tweet_sentimentr_class, 
        levels = c("Negative", "Neutral", "Positive")),
      y = ordered(
        tweet_flair_sentiment_class, 
        levels = c("Negative", "Neutral", "Positive")),
      size = n,
      colour = n
    )
  ) +
  geom_point() +
  scale_size(name = "Count") +
  scale_colour_viridis_b(
    name = "Count"
  ) +
  xlab("Sentimentr") +
  ylab("Flair Sentiment") +
  coord_fixed(ratio = 1) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    axis.text.x = 
      element_text(
        angle = 45, 
        vjust = 1, 
        hjust = 1
      )
  )
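
As a complementary summary, the overall share of tweets on which the two re-classified sentiment columns agree can, for example, be computed as follows.

# Overall agreement between the two sentiment re-classifications
tweet_classifications %>% 
  summarise(
    agreement_pct = mean(
      as.character(tweet_sentimentr_class) == as.character(tweet_flair_sentiment_class),
      na.rm = TRUE
    ) * 100
  )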

Classification heatmaps

The names of the classes are shortened to make the visualisations more readable.

tweet_classifications_to_plot <-
  tweet_classifications %>% 
  mutate(
    across(
      c(
        tweet_flair_e6c11m2_top_emotion, 
        tweet_flair_c6c12m1_top_context,
        btm200bg_topic_sum_b,
        trans_umap_hdbscan
      ),
      ~str_trunc(.x, 15)
    )
  )
tweet_classifications_to_plot %>% 
  match_and_plot(
    "tweet_flair_e6c11m2_top_emotion", "Emotion",
    "tweet_flair_c6c12m1_top_context", "Context"
  )

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Plot
  match_and_plot_top(
    "btm200bg_topic_sum_b", "BTM topics",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
## ℹ Please use `all_of(var)` (or `any_of(var)`) instead of `.data[[var]]`
## Joining, by = "btm200bg_topic_sum_b"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 66007 rows containing missing values (`geom_tile()`).
## Warning: Removed 66007 rows containing missing values (`geom_tile()`).
## Removed 66007 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Plot
  match_and_plot_top(
    "tweet_flair_e6c11m2_top_emotion", "Emotion",
    "btm200bg_topic_sum_b", "BTM topics"
  )
## Joining, by = "tweet_flair_e6c11m2_top_emotion"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 1504 rows containing missing values (`geom_tile()`).
## Removed 1504 rows containing missing values (`geom_tile()`).
## Removed 1504 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Plot
  match_and_plot_top(
    "tweet_flair_c6c12m1_top_context", "Context",
    "btm200bg_topic_sum_b", "BTM topics"
  )
## Joining, by = "tweet_flair_c6c12m1_top_context"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 1581 rows containing missing values (`geom_tile()`).
## Removed 1581 rows containing missing values (`geom_tile()`).
## Removed 1581 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Plot
  match_and_plot_top(
    "tweet_flair_e6c11m2_top_emotion", "Emotion",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Joining, by = "tweet_flair_e6c11m2_top_emotion"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 10081 rows containing missing values (`geom_tile()`).
## Removed 10081 rows containing missing values (`geom_tile()`).
## Removed 10081 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Plot
  match_and_plot_top(
    "tweet_flair_c6c12m1_top_context", "Context",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Joining, by = "tweet_flair_c6c12m1_top_context"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 10143 rows containing missing values (`geom_tile()`).
## Removed 10143 rows containing missing values (`geom_tile()`).
## Removed 10143 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Plot
  mutate(
    emotion_context = paste0(
      str_trunc(tweet_flair_e6c11m2_top_emotion, 5, ellipsis = ""),
      "+",
      str_trunc(tweet_flair_c6c12m1_top_context, 8, ellipsis = "")
    )
  ) %>%
  match_and_plot_top(
    "emotion_context", "Emotion + Context",
    "btm200bg_topic_sum_b", "BTM topics"
  )
## Joining, by = "emotion_context"
## Joining, by = "btm200bg_topic_sum_b"
## Warning: Removed 8929 rows containing missing values (`geom_tile()`).
## Removed 8929 rows containing missing values (`geom_tile()`).
## Removed 8929 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Plot
  mutate(
    emotion_context = paste0(
      str_trunc(tweet_flair_e6c11m2_top_emotion, 5, ellipsis = ""),
      "+",
      str_trunc(tweet_flair_c6c12m1_top_context, 8, ellipsis = "")
    )
  ) %>%
  match_and_plot_top(
    "emotion_context", "Emotion + Context",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Joining, by = "emotion_context"
## Joining, by = "trans_umap_hdbscan"
## Warning: Removed 28856 rows containing missing values (`geom_tile()`).
## Removed 28856 rows containing missing values (`geom_tile()`).
## Removed 28856 rows containing missing values (`geom_tile()`).

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Select clusters
  right_join(
    tweet_classifications_to_plot %>% 
      group_by(trans_umap_hdbscan) %>% 
      summarise(
        n = n(),
        mean_trans_umap_hdbscan_prob = mean(trans_umap_hdbscan_prob),
        sd_trans_umap_hdbscan_prob = sd(trans_umap_hdbscan_prob)
      ) %>%
      ungroup() %>% 
      filter(n > 100 & sd_trans_umap_hdbscan_prob < 0.01) %>% 
      slice_max(order_by = mean_trans_umap_hdbscan_prob, n = 20, with_ties = FALSE) %>% 
      select(trans_umap_hdbscan)
  ) %>% 
  # Plot
  match_and_plot(
    "tweet_flair_e6c11m2_top_emotion", "Emotion",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Joining, by = "trans_umap_hdbscan"

tweet_classifications_to_plot %>% 
  # Filter-out massive-reply cluster
  filter(!(trans_umap_hdbscan %in% c("0"))) %>%
  # Select clusters
  right_join(
    tweet_classifications_to_plot %>% 
      group_by(trans_umap_hdbscan) %>% 
      summarise(
        n = n(),
        mean_trans_umap_hdbscan_prob = mean(trans_umap_hdbscan_prob),
        sd_trans_umap_hdbscan_prob = sd(trans_umap_hdbscan_prob)
      ) %>%
      ungroup() %>% 
      filter(n > 100 & sd_trans_umap_hdbscan_prob < 0.01) %>% 
      slice_max(order_by = mean_trans_umap_hdbscan_prob, n = 20, with_ties = FALSE) %>% 
      select(trans_umap_hdbscan)
  ) %>% 
  # Plot
  match_and_plot(
    "tweet_flair_c6c12m1_top_context", "Context",
    "trans_umap_hdbscan", "Trans.-based"
  )
## Joining, by = "trans_umap_hdbscan"

UMAP plots

The plots below illustrate the relationship between the different classifications in an area of the two-dimensional space created through the UMAP dimensionality reduction process.

The following code defines the colour palettes used in the plots below, based on the colour-blind safe palettes developed by Paul Tol.

# Grey colour from the Vibrant qualitative palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_VibrantGrey <- "#BBBBBB"
  
# High-contrast qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_RYB <- c("#BB5566", "#DDAA33", "#004488")

# Muted qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_Muted <- c("#CC6677", "#332288", "#DDCC77", "#117733", "#88CCEE", "#882255", "#44AA99", "#999933", palette_PaulTol_VibrantGrey)

# Light qualitative colour palette by Paul Tol
# https://personal.sron.nl/~pault/
palette_PaulTol_Light <- c("#77AADD", "#EE8866", "#EEDD88", "#AAAA00", "#99DDFF", "#44BB99", "#BBCC33", "#FFAABB", "#DDDDDD")
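
The palettes can be previewed, for example, using show_col from the scales package (already used above for breaks_log and label_comma).

# Illustrative preview of the colour palettes defined above
show_col(palette_PaulTol_RYB)
show_col(palette_PaulTol_Muted)
show_col(palette_PaulTol_Light)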

Example 1: xlim(0, 4) + ylim(-3, 1)

umap_plane_sentimentr <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    #colour = tweet_sentimentr_sentiment
    colour = tweet_sentimentr_class
  )) +
  geom_point(size = 0.01) +
  # scale_color_distiller(
  #   name = "Sentimentr (plot a)",
  #   palette = "RdYlBu", 
  #   direction = 1,
  #   limits = c(-1, 1),
  #   breaks = c(-1, -0.5, 0, 0.5, 1),
  #   label = c("<= -1", "-0.5", "0", "0.5", ">= 1"),
  #   oob = squish
  # ) +
  # scale_colour_brewer(
  #   palette = "RdYlBu"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_RYB
  ) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Sentimentr") +
  theme_bw() +
  theme(legend.position = "none")

umap_plane_flair_sentiment <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    #colour = tweet_flair_sentiment_value
    colour = tweet_flair_sentiment_class
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set3"
  # ) +
  # scale_colour_brewer(
  #   palette = "RdYlBu"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_RYB
  ) +
  guides(colour = guide_legend(
    "Sentiment (plot a and b)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Flair sentiment") +
  theme_bw()
  #+ theme(legend.position = "none")

umap_plane_btm <-
  tweet_classifications %>%
  mutate(
    btm200bg_topic_sum_b = na_if(btm200bg_topic_sum_b, -1)
  ) %>% 
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = as_factor(as.character(btm200bg_topic_sum_b))
  )) +
  geom_point(size = 0.01) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("BTM (repeated colours)") +
  theme_bw() +
  theme(legend.position = "none")

umap_plane_trans <-
  tweet_classifications %>%
  mutate(
    trans_umap_hdbscan = na_if(trans_umap_hdbscan, -1)
  ) %>% 
  left_join(
    tweet_classifications %>% 
      filter(
        trans_umap_hdbscan %in% c("250", "251", "458", "456")
      ) %>% 
      group_by(trans_umap_hdbscan) %>% 
      # select the point closer to the
      # left border of the plot to label
      mutate(
        dist_from_origin = sqrt(((trans_umap_comp1 + 1) ^ 2) + ((trans_umap_comp2 + 2) ^ 2))
      ) %>% 
      slice_min(
        order_by = dist_from_origin,
        n=1
      ) %>% 
      mutate(
        plot_label = trans_umap_hdbscan
      ) %>% 
      select(tweet_id_str, plot_label)
  ) %>% 
  #replace_na(list(plot_label = "")) %>% 
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = as_factor(as.character(trans_umap_hdbscan)),
    label = plot_label
  )) +
  geom_point(size = 0.01) +
  #geom_label(
  geom_label_repel(
    fill = "#FFFFFF",
    colour = "#000000",
    alpha = 0.7,
    #size = 3,
    force_pull = 0.5,
    box.padding = 0,
    max.overlaps = Inf,
    segment.curvature = -0.1,
    min.segment.length = 3
  ) + 
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Transformer-based (repeated colours)") +
  #theme(legend.position = "none")
  scale_colour_discrete(
    breaks = c("250"), 
    labels = c("The colours used in plot c and\nd are randomised and repeated\nmultiple times for different cluster,\naiming to achieve a simple visual\ndiscernability of the clusters\nrather than full identification.")
  ) + 
  guides(colour = guide_legend(
    "Topic (plot c and d)",
    override.aes = list(alpha=0.0)
  )) +
  theme_bw()
## Adding missing grouping variables: `trans_umap_hdbscan`
## Joining, by = c("tweet_id_str", "trans_umap_hdbscan")
umap_plane_emotion <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = tweet_flair_e6c11m2_top_emotion
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set3"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_Light
  ) +
  guides(colour = guide_legend(
    "Emotion (plot e)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Emotion") +
  theme_bw()
  #+ theme(legend.position = "none")

umap_plane_context <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = tweet_flair_c6c12m1_top_context
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set1"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_Muted
  ) +
  guides(colour = guide_legend(
    "Context (plot f)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(0, 4) + ylim(-3, 1) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Context") +
  theme_bw()
  #+ theme(legend.position = "none")

(umap_plane_sentimentr + umap_plane_flair_sentiment) /
(umap_plane_btm + umap_plane_trans) /
(umap_plane_emotion + umap_plane_context) +
  plot_annotation(
    tag_levels = c("a")
    #, caption = 'The colours used in plot c and b are randomised and repeated multiple times for different cluster, aiming\nto achieve a simple visual discernability of the different clusters rather than full identification.'
  ) + 
  plot_layout(
    guides = 'collect'
  )
## Warning: Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).
## Warning: Removed 849984 rows containing missing values (`geom_label_repel()`).
## Warning: Removed 802815 rows containing missing values (`geom_point()`).
## Removed 802815 rows containing missing values (`geom_point()`).

Example 2: xlim(6, 10) + ylim(3, 7)

umap_plane_sentimentr <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    #colour = tweet_sentimentr_sentiment
    colour = tweet_sentimentr_class
  )) +
  geom_point(size = 0.01) +
  # scale_color_distiller(
  #   name = "Sentimentr (plot a)",
  #   palette = "RdYlBu", 
  #   direction = 1,
  #   limits = c(-1, 1),
  #   breaks = c(-1, -0.5, 0, 0.5, 1),
  #   label = c("<= -1", "-0.5", "0", "0.5", ">= 1"),
  #   oob = squish
  # ) +
  # scale_colour_brewer(
  #   palette = "RdYlBu"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_RYB
  ) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Sentimentr") +
  theme_bw() +
  theme(legend.position = "none")

umap_plane_flair_sentiment <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    #colour = tweet_flair_sentiment_value
    colour = tweet_flair_sentiment_class
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set3"
  # ) +
  # scale_colour_brewer(
  #   palette = "RdYlBu"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_RYB
  ) +
  guides(colour = guide_legend(
    "Sentiment (plot a and b)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Flair sentiment") +
  theme_bw()
  #+ theme(legend.position = "none")

umap_plane_btm <-
  tweet_classifications %>%
  mutate(
    btm200bg_topic_sum_b = na_if(btm200bg_topic_sum_b, -1)
  ) %>% 
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = as_factor(as.character(btm200bg_topic_sum_b))
  )) +
  geom_point(size = 0.01) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("BTM (repeated colours)") +
  theme_bw() +
  theme(legend.position = "none")

umap_plane_trans <-
  tweet_classifications %>%
  mutate(
    trans_umap_hdbscan = na_if(trans_umap_hdbscan, -1)
  ) %>% 
  left_join(
    tweet_classifications %>% 
      filter(
        trans_umap_hdbscan %in% c("1016", "543", "627", "746", "869")
      ) %>% 
      group_by(trans_umap_hdbscan) %>% 
      # select the point closer to the
      # left border of the plot to label
      mutate(
        dist_from_origin = sqrt(((trans_umap_comp1 - 6) ^ 2) + ((trans_umap_comp2 - 7) ^ 2))
      ) %>% 
      slice_min(
        order_by = dist_from_origin,
        n = 1
      ) %>% 
      mutate(
        plot_label = trans_umap_hdbscan
      ) %>% 
      select(tweet_id_str, plot_label)
  ) %>% 
  #replace_na(list(plot_label = "")) %>% 
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = as_factor(as.character(trans_umap_hdbscan)),
    label = plot_label
  )) +
  geom_point(size = 0.01) +
  #geom_label(
  geom_label_repel(
    fill = "#FFFFFF",
    colour = "#000000",
    alpha = 0.7,
    #size = 3,
    force_pull = 0.5,
    box.padding = 0,
    max.overlaps = Inf,
    segment.curvature = -0.1,
    min.segment.length = 0
  ) + 
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Transformer-based (repeated colours)") +
  theme_bw() +
  theme(legend.position = "none")
## Adding missing grouping variables: `trans_umap_hdbscan`
## Joining, by = c("tweet_id_str", "trans_umap_hdbscan")
umap_plane_emotion <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = tweet_flair_e6c11m2_top_emotion
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set3"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_Light
  ) +
  guides(colour = guide_legend(
    "Emotion (plot e)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Emotion") +
  theme_bw()
  #+ theme(legend.position = "none")

umap_plane_context <-
  tweet_classifications %>%
  ggplot(aes(
    x = trans_umap_comp1,
    y = trans_umap_comp2,
    colour = tweet_flair_c6c12m1_top_context
  )) +
  geom_point(size = 0.01) +
  # scale_colour_brewer(
  #   palette = "Set1"
  # ) +
  scale_colour_manual(
    values = palette_PaulTol_Muted
  ) +
  guides(colour = guide_legend(
    "Context (plot f)",
    override.aes = list(size=5)
  )) +
  #xlim(-12.5, 22.5) + ylim(-12.5, 22.5) +
  xlim(6, 10) + ylim(3, 7) +
  xlab("Transformer-UMAP comp. 1") +
  ylab("Transformer-UMAP comp. 2") +
  coord_fixed(ratio = 1) +
  ggtitle("Context") +
  theme_bw()
  #+ theme(legend.position = "none")

(umap_plane_sentimentr + umap_plane_flair_sentiment) /
(umap_plane_btm + umap_plane_trans) /
(umap_plane_emotion + umap_plane_context) +
  plot_annotation(
    tag_levels = c("a"),
    caption = 'The colours used in plots c and d are randomised and repeated multiple times for different clusters, aiming\nto achieve a simple visual discernibility of the different clusters rather than full identification.'
  ) + 
  plot_layout(
    guides = 'collect'
  )
## Warning: Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).
## Warning: Removed 849983 rows containing missing values (`geom_label_repel()`).
## Warning: Removed 689060 rows containing missing values (`geom_point()`).
## Removed 689060 rows containing missing values (`geom_point()`).