if (!require("pacman")) install.packages("pacman")
pacman::p_load(
    here,                   # project-relative file paths
    magrittr, janitor,      # pipes & data cleaning helpers
    ggpubr,                 # publication-ready plots
    gt, gtExtras,           # tables
    countdown,              # countdown timers for slides
    quanteda,               # quanteda text processing
    quanteda.textplots, quanteda.textstats, quanteda.textmodels,
    tidytext,               # tidy text analysis
    udpipe, spacyr,         # POS tagging
    stm, stminsights,       # structural topic models & interactive inspection
    easystats, tidyverse    # statistics & general data wrangling
)
Topic Modeling with stminsights
Session 09 - 🔨 Advanced Methods in R
Goal of the session: applying
stminsights
- Preparing the data for use with stminsights (v0.4.3, Schwemmer 2021)
Background
Today’s data basis: Topic Modeling
Transcripts & chats of the live streams from hasanabi, zackrawrr, and TheMajorityReport for the presidential (Harris vs. Trump) and vice-presidential (Vance vs. Walz) debates in 2024
- The best way to learn R is by trying. This document walks through a version of a typical data processing workflow.
- Use the tidytuesday data as an example to showcase the potential
Preparation
Packages
The pacman::p_load() function from the pacman package is used to load the packages. It has several advantages over the conventional approach with library() (see the comparison sketch after this list):
- Concise syntax
- Automatic installation (if the package is not already installed)
- Loading multiple packages at once
- Automatic search for dependencies
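For comparison, a minimal sketch of the conventional approach (assuming the packages are not yet installed; the package selection shown is only an excerpt of the list above):
# Conventional alternative: install (once) and then load each package explicitly
# install.packages(c("here", "quanteda", "stm", "stminsights", "tidyverse"))
library(here)
library(quanteda)
library(stm)
library(stminsights)
library(tidyverse)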
Import and Preprocessing of the Data
Information
For information about how the data, especially the topic modeling results, were prepared, processed and estimated, please see the tutorial.
# Import base data
chats <- qs::qread(here("local_data/chat-debates_full.qs"))$correct
# Import corpora
chats_spacyr <- qs::qread(here("local_data/chat-corpus_spacyr.qs"))
stm_search <- qs::qread(here("local_data/stm-majority_report-search.qs"))
stm_results <- qs::qread(here("local_data/stm-majority_report-results.qs"))
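A quick sanity check after the import can save debugging later. The sketch below assumes that chats is a data frame containing the columns used in the following steps (message_id, streamer, platform, debate, message_during_debate):
# Inspect the structure of the imported chat data
glimpse(chats)
# Check that the columns used below are present
all(c("message_id", "streamer", "platform", "debate",
      "message_during_debate") %in% names(chats))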
Preprocessing of the Data
# Convert grouping variables to factors
chats_valid <- chats %>% 
  mutate(
    across(c(debate, platform), ~as.factor(.x))
  )
Preprocessing of the Corpus
# Convert the spacyr output to quanteda tokens (using lemmas)
chat_spacyr_toks <- chats_spacyr %>% 
  as.tokens(
    use_lemma = TRUE
  ) %>% 
  tokens(
    remove_punct = TRUE, 
    remove_symbols = TRUE,
    remove_numbers = FALSE,
    remove_url = FALSE, 
    split_hyphens = FALSE,
    split_tags = FALSE
  ) %>% 
  tokens_remove(
    pattern = stopwords("en")
  ) %>% 
  tokens_ngrams(n = 1:3)
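To verify that the tokenization and cleaning worked as intended, the resulting tokens object can be inspected briefly (a quick check, not part of the original pipeline):
# Number of documents and total number of tokens after preprocessing
ndoc(chat_spacyr_toks)
sum(ntoken(chat_spacyr_toks))
# Peek at the first two tokenized messages
print(chat_spacyr_toks[1:2])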
Add docvars
# Get document names from the original data
original_docnames <- chats$message_id
# Get document names from the tokens object
token_docnames <- docnames(chat_spacyr_toks)
# Identify missing documents
missing_docs <- setdiff(original_docnames, token_docnames)
# Exclude "empty" messages
chats_filtered <- chats_valid %>% 
  filter(!message_id %in% missing_docs)
# Add docvars
docvars(chat_spacyr_toks) <- chats_filtered
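Because the document variables are matched to the tokens by position, a short consistency check is advisable (a minimal sketch, assuming the filtering above removed exactly the empty messages):
# The number of token documents should equal the number of remaining messages
ndoc(chat_spacyr_toks) == nrow(chats_filtered)
# The assigned docvars should now be attached to the tokens object
head(docvars(chat_spacyr_toks))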
Focus on The Majority Report
# Subset tokens based on docvars
majority_report_chat_toks <- tokens_subset(
  chat_spacyr_toks, streamer == "the_majority_report")
# Convert to DFM
majority_report_chat_dfm <- majority_report_chat_toks %>% 
  dfm()
# Pruning: drop very rare and near-ubiquitous features
majority_report_chat_trim <- majority_report_chat_dfm %>% 
    dfm_trim(
        min_docfreq = 50/nrow(chats),
        max_docfreq = 0.99, 
        docfreq_type = "prop"
   )
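The effect of the pruning can be checked by comparing the number of features before and after trimming (a quick check; the exact numbers depend on the chat data):
# Features before vs. after pruning
nfeat(majority_report_chat_dfm)
nfeat(majority_report_chat_trim)
# Share of features removed by the trimming thresholds
1 - nfeat(majority_report_chat_trim) / nfeat(majority_report_chat_dfm)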
# Convert for stm topic modeling
majority_report_chat_stm <- majority_report_chat_trim %>% 
   convert(to = "stm")empty_docs <- Matrix::rowSums(
  as(majority_report_chat_trim, "Matrix")) == 0 
empty_docs_ids <- majority_report_chat_trim@docvars$docname[empty_docs]
chats_model <- chats_filtered %>% 
  filter(!(message_id %in% empty_docs_ids)) %>% 
  filter(streamer == "the_majority_report")
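Since documents that became empty after trimming are dropped during the conversion, the stm input and the aligned metadata should now cover the same documents. A short check (a sketch, assuming the steps above ran as shown):
# Documents in the stm input vs. rows in the aligned metadata
length(majority_report_chat_stm$documents) == nrow(chats_model)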
Export topic models
K = 12
# Get model
tpm_k12 <- stm_results %>% 
   filter(k == 12) %>% 
   pull(mdl) %>% .[[1]]
# Estimate effects
effects_k12 <- estimateEffect(
  formula =~ platform + debate + message_during_debate,
  stmobj = tpm_k12, 
  metadata = chats_model)
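Before exporting the workspace, the models and effect estimates can be inspected directly, e.g. via the top terms per topic and a summary of the estimated covariate effects (a sketch for the K = 12 model; the same pattern applies to K = 14 and K = 18):
# Top terms per topic for the K = 12 model
labelTopics(tpm_k12, n = 10)
# Summary of the estimated effects for the first topic
summary(effects_k12, topics = 1)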
K = 14
# Get model
tpm_k14 <- stm_results %>% 
   filter(k == 14) %>% 
   pull(mdl) %>% .[[1]]
# Estimate effects
effects_k14 <- estimateEffect(
  formula =~ platform + debate + message_during_debate,
  stmobj = tpm_k14, 
  metadata = chats_model)
K = 18
# Get model
tpm_k18 <- stm_results %>% 
   filter(k == 18) %>% 
   pull(mdl) %>% .[[1]]
# Estimate effects
effects_k18 <- estimateEffect(
  formula =~ platform + debate + message_during_debate,
  stmobj = tpm_k18, 
  metadata = chats_model)
🛠️ Practical Exercise
Attention, please read!
- Before you start working on the following 📋 Exercises, please make sure that you have run all chunks of the Preparation section. You can do this by using the “Run all chunks above” button of the next chunk.
- If you have questions about the code, it is worth taking a look at the tutorial (.qmd or .html). The tutorial is a compact presentation of the R code used in the presentation, so you can use it to look up the code building blocks behind the R outputs on the slides.
Prepare and save workspace
# Set names required by stminsights
data <- majority_report_chat_trim
out <- majority_report_chat_stm
# Clean workspace
rm(list = setdiff(
  ls(),
  c("data", "out",
    "tpm_k12", "tpm_k14", "tpm_k18", 
    "effects_k12", "effects_k14", "effects_k18")))
# Save workspace
save.image(here("stm_session_09.RData"))
Start stminsights
library(stminsights)
run_stminsights()
References
Schwemmer, Carsten. 2021. stminsights: A Shiny Application for Inspecting Structural Topic Models. https://github.com/cschwem2er/stminsights.
