🔨 Topic Modeling in R

Tutorial - Session 09

Published

04.01.2025

Open interactive and executable RStudio environment

Background

Preparation

if (!require("pacman")) install.packages("pacman")
pacman::p_load(
    here, 
    magrittr, janitor,
    ggpubr, 
    gt, gtExtras,
    countdown, 
    quanteda, # quanteda text processing
    quanteda.textplots, quanteda.textstats, quanteda.textmodels,
    tidytext, 
    udpipe, spacyr, # POS tagging
    stm, stminsights,
    easystats, tidyverse
)

# Import base data
chats <- qs::qread(here("local_data/chat-debates_full.qs"))$correct

# Import corpora
chats_spacyr <- qs::qread(here("local_data/chat-corpus_spacyr.qs"))
stm_search <- qs::qread(here("local_data/stm-majority_report-search.qs"))
stm_results <- qs::qread(here("local_data/stm-majority_report-results.qs"))

Codechunks aus der Sitzung

Vorverarbeitung der Daten

chats_valid <- chats %>% 
  mutate(
    across(c(debate, platform), ~as.factor(.x))
  )

Vorverarbeitung des Korpus

# spacyr-Korpus zu Tokens
chat_spacyr_toks <- chats_spacyr %>% 
  as.tokens(
    use_lemma = TRUE
  ) %>% 
  tokens(
    remove_punct = TRUE, 
    remove_symbols = TRUE,
    remove_numbers = FALSE,
    remove_url = FALSE, 
    split_hyphens = FALSE,
    split_tags = FALSE,
  ) %>% 
  tokens_remove(
    pattern = stopwords("en")
  ) %>% 
  tokens_ngrams(n = 1:3)

Add docvars

# Get document names from the original data
original_docnames <- chats$message_id

# Get document names from the tokens object
token_docnames <- docnames(chat_spacyr_toks)

# Identify missing documents
missing_docs <- setdiff(original_docnames, token_docnames)

# Exclude "empty" messages
chats_filtered <- chats_valid %>% 
  filter(!message_id %in% missing_docs)

# Add docvars
docvars(chat_spacyr_toks) <- chats_filtered

Fokus on The Majority Report

# Subset tokens based on docvars
majority_report_chat_toks <- tokens_subset(
  chat_spacyr_toks, streamer == "the_majority_report")

# Convert to DFM
majority_report_chat_dfm <- majority_report_chat_toks %>% 
  dfm()

# Pruning
majority_report_chat_trim <- majority_report_chat_dfm %>% 
    dfm_trim(
        min_docfreq = 50/nrow(chats),
        max_docfreq = 0.99, 
        docfreq_type = "prop"
   )

# Convert for stm topic modeling
majority_report_chat_stm <- majority_report_chat_trim %>% 
   convert(to = "stm")

Estimate models

# Define parameters
future::plan(future::multisession()) # use multiple sessions
topic_range <- seq(from = 4, to = 20, by = 2) 

# Initiate notifications & time tracking
tictoc::tic("STM chats - Majority Report")

# Estimate models
stm_search  <- tibble(k = topic_range) %>%
    mutate(
        mdl = furrr::future_map(
            k, 
            ~stm::stm(
                documents = majority_report_chat_stm$documents,
                vocab = majority_report_chat_stm$vocab, 
                prevalence =~ platform + debate + message_during_debate, 
                K = ., 
                seed = 42,
                max.em.its = 1000,
                data = majority_report_chat_stm$meta,
                init.type = "Spectral",
                verbose = TRUE),
            .options = furrr::furrr_options(seed = 42)
            )
    )

# Sent status update and finish time tracking
tictoc::toc(log = TRUE)

# Create heldout
heldout <- make.heldout(
  majority_report_chat_stm$documents,
  majority_report_chat_stm$vocab,
  seed = 42)

# Create model diagnostics
stm_results <- stm_search %>%
  mutate(
    exclusivity = map(mdl, exclusivity),
    semantic_coherence = map(mdl, semanticCoherence, majority_report_chat_stm$documents),
    eval_heldout = map(mdl, eval.heldout, heldout$missing),
    residual = map(mdl, checkResiduals, majority_report_chat_stm$documents),
    bound =  map_dbl(mdl, function(x) max(x$convergence$bound)),
    lfact = map_dbl(mdl, function(x) lfactorial(x$settings$dim$K)),
    lbound = bound + lfact,
    iterations = map_dbl(mdl, function(x) length(x$convergence$bound))
    )

Vergleich des statistischen Fits

stm_results %>%
  transmute(
    k,
    `Lower bound` = lbound,
    Residuals = map_dbl(residual, "dispersion"),
    `Semantic coherence` = map_dbl(semantic_coherence, mean),
    `Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
  gather(Metric, Value, -k) %>%
  ggplot(aes(k, Value, color = Metric)) +
    geom_line(size = 1.5, alpha = 0.7, show.legend = FALSE) +
    geom_point(size = 3) +
    scale_x_continuous(breaks = seq(from = 4, to = 20, by = 2)) +
    facet_wrap(~Metric, scales = "free_y") +
    labs(x = "K (Anzahl der Themen)",
         y = NULL,
         title = "Statistischer Fit der STM-Modelle",
         subtitle = "Kohärenz sollte hoch, Residuen niedrig sein"
    ) +
    theme_pubr()

Hohe Kohärenz bei hoher Exklusivität

# Models for comparison
models_for_comparison = c(12, 14, 18)

# Create figures
fig_excl <- stm_results %>% 
  # Edit data
  select(k, exclusivity, semantic_coherence) %>%
  filter(k %in% models_for_comparison) %>%
  unnest(cols = c(exclusivity, semantic_coherence))  %>%
  mutate(k = as.factor(k)) %>%
  # Build graph
  ggplot(aes(semantic_coherence, exclusivity, color = k)) +
    geom_point(size = 2, alpha = 0.7) +
    labs(
      x = "Semantic coherence",
      y = "Exclusivity"
      # title = "Comparing exclusivity and semantic coherence",
      # subtitle = "Models with fewer topics have higher semantic coherence for more topics, but lower exclusivity"
      ) +
      theme_pubr()  

# Create plotly
fig_excl %>% plotly::ggplotly()

Extraktion der Beta- & Gamma-Matrix

# Define model
tpm_k14 <- stm_results %>% 
   filter(k == 14) |> 
   pull(mdl) %>% .[[1]]

# Beta matrix
tpm_k14 %>% 
  tidy(., matrix = "frex")

# A tibble: 154,490 × 2
   topic term                
   <int> <chr>               
 1     1 look_like           
 2     1 look_like_go        
 3     1 like_go             
 4     1 hahahahahahaha      
 5     1 look_like_s         
 6     1 think_go            
 7     1 check_moderator     
 8     1 fact_check_moderator
 9     1 moderator_fact      
10     1 moderator_fact_check
# ℹ 154,480 more rows

# Gamma matrix
tpm_k14 %>% 
  tidy(.,matrix = "gamma", 
    document_names = names(majority_report_chat_stm$documents)
    )

# A tibble: 322,840 × 3
   document                                 topic   gamma
   <chr>                                    <int>   <dbl>
 1 ChwKGkNJR2poT3pVdVlnREZha1FyUVlkblNrWS1B     1 0.0261 
 2 ChwKGkNLbXd3LXpVdVlnREZWd0wxZ0FkYW9FSWdB     1 0.0265 
 3 ChwKGkNKR1RsdV9VdVlnREZkNFhyUVlkZ2d3Tk5n     1 0.0123 
 4 ChwKGkNOQ3kxUExVdVlnREZVS1k1UWNkQ0t3Mlhn     1 0.0200 
 5 ChwKGkNPcW5fZkxVdVlnREZlSFJsQWtkbThZaUtR     1 0.0232 
 6 ChwKGkNNUHZzdlhVdVlnREZha1FyUVlkblNrWS1B     1 0.0236 
 7 ChwKGkNLT1JuX2pVdVlnREZZX0FsQWtkcEw4Wmd3     1 0.434  
 8 ChwKGkNLRElvZmpVdVlnREZaWExGZ2tkTy1ZSXVR     1 0.0118 
 9 ChwKGkNNblNqZm5VdVlnREZhX0l3Z1FkZUg0bHZn     1 0.0356 
10 ChwKGkNMeUkyUHZVdVlnREZXQUhyUVlkTUJvZ193     1 0.00307
# ℹ 322,830 more rows

Extraktion der Top Features nach Thema

# Create gamma data
top_gamma_k14 <- tpm_k14 %>%
  tidy(matrix = "gamma") %>% 
  dplyr::group_by(topic) %>%
  dplyr::summarise(
    gamma = mean(gamma),
    .groups = "drop") %>%
  dplyr::arrange(desc(gamma))

# Create beta data
top_beta_k14 <- tpm_k14 %>%
  tidytext::tidy(.) %>% 
  dplyr::group_by(topic) %>%
  dplyr::arrange(-beta) %>%
  dplyr::top_n(7, wt = beta) %>% 
  dplyr::select(topic, term) %>%
  dplyr::summarise(
    terms_beta = toString(term),
    .groups = "drop")

# Merge gamma & beta data
top_topics_terms_k14 <- top_beta_k14 %>% 
  dplyr::left_join(
    top_gamma_k14, 
    by = "topic") %>%
  dplyr::mutate(
          topic = paste0("Topic ", topic),
          topic = reorder(topic, gamma)
      )

# Create output
top_topics_terms_k14 %>%
  mutate(across(gamma, ~round(.,3))) %>% 
  dplyr::arrange(-gamma) %>% 
  gt() %>% 
  gtExtras::gt_theme_538() %>% 
  gt::tab_options(
    table.width = gt::pct(100), 
    table.font.size = "12px"
    )

topic	terms_beta	gamma
Topic 8	make, 's, lul, emma, fuchsia, liar, kekl	0.115
Topic 7	good, right, now, yes, plan, lie, bad	0.113
Topic 12	kamala, want, biden, eat, take, vote, god	0.109
Topic 5	get, s, wow, mad, omg, thank, nice	0.101
Topic 4	lmao, omegalul, red, green, orange, baby, kekw	0.079
Topic 3	time, sam, love, man, need, old, big	0.075
Topic 11	say, oh, ..., know, look, shit, yeah	0.075
Topic 1	go, like, fact, debate, look, check, keep	0.058
Topic 13	trump, just, donald, lose, racist, win, can	0.056
Topic 9	lol, one, give, ...., wtf, china, okay	0.051
Topic 6	people, think, go, back, work, try, change	0.050
Topic 10	let, talk, ’s, can, like, sound, see	0.045
Topic 2	stop, start, please, israel, use, laugh, agree	0.039
Topic 14	face, guy, don, bring, real, country, rolling_on_the_floor_laughe	0.034

Extraktion & Zusammenführung der Daten

# Prepare for merging
topic_gammas_k14 <- tpm_k14 %>%
  tidy(matrix = "gamma") %>% 
  dplyr::group_by(document) %>% 
  tidyr::pivot_wider(
    id_cols = document, 
    names_from = "topic", 
    names_prefix = "gamma_topic_",
    values_from = "gamma")
      
gammas_k14 <- tpm_k14 %>%
  tidytext::tidy(matrix = "gamma") %>% 
  dplyr::group_by(document) %>% 
  dplyr::slice_max(gamma) %>% 
  dplyr::mutate(
    main_topic = ifelse(
      gamma > 0.5, topic, NA)) %>% 
  rename(
    top_topic = topic,
    top_gamma = gamma) %>% 
  ungroup() %>% 
  left_join(.,
    topic_gammas_k14,
    by = join_by(document))

# Identify empty documents
empty_docs <- Matrix::rowSums(
  as(majority_report_chat_trim, "Matrix")) == 0 
empty_docs_ids <- majority_report_chat_trim@docvars$docname[empty_docs]

# Merge with original data
chats_topics <- chats_filtered %>%
  filter(!(message_id %in% empty_docs_ids)) %>% 
  filter(streamer == "the_majority_report") %>%   
  bind_cols(gammas_k14) %>% 
  select(-document)   

# Preview
chats_topics %>% glimpse

Rows: 23,060
Columns: 50
$ streamer              <chr> "the_majority_report", "the_majority_report", "t…
$ url                   <chr> "https://www.youtube.com/watch?v=lzobJil9Sgc", "…
$ platform              <fct> youtube, youtube, youtube, youtube, youtube, you…
$ debate                <fct> presidential, presidential, presidential, presid…
$ user_name             <chr> "Scott Plant", "Rebecca W", "Galactic News Netwo…
$ user_id               <chr> "UC4mxlnk193JrXVAp6K-vEpQ", "UCeenHJ1v62biyOyKwL…
$ user_display_name     <chr> "Scott Plant", "Rebecca W", "Galactic News Netwo…
$ user_badges           <list> [], [], [], [], [], [], [], [], [], [], [], [],…
$ message_timestamp     <dbl> -152, -151, -145, -138, -137, -132, -126, -126, …
$ message_id            <chr> "ChwKGkNJR2poT3pVdVlnREZha1FyUVlkblNrWS1B", "Chw…
$ message_type          <chr> "text_message", "text_message", "text_message", …
$ message_content       <chr> "Donnie will say, \"That is my own sperm.\"", "w…
$ message_emotes        <list> [], [], [["UCkszU2WH9gy1mb0dV-11UJg/ssIfY7OFG5O…
$ message_length        <int> 40, 45, 52, 38, 10, 32, 8, 14, 2, 90, 20, 36, 20…
$ message_timecode      <Period> -2M -32S, -2M -31S, -2M -25S, -2M -18S, -2M -…
$ message_time          <chr> "23:57:28", "23:57:29", "23:57:35", "23:57:42", …
$ message_during_debate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_has_badge        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_premium       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_subscriber    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_turbo         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_moderator     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_partner       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_subgifter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_broadcaster   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_vip           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_twitchdj      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_founder       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_staff         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_game_dev      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_is_ambassador    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_no_audio         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ user_no_video         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ top_topic             <int> 11, 7, 4, 3, 4, 3, 1, 4, 9, 4, 8, 3, 1, 1, 3, 13…
$ top_gamma             <dbl> 0.4435422, 0.3412468, 0.7627751, 0.5663056, 0.46…
$ main_topic            <int> NA, NA, 4, 3, NA, NA, NA, 4, NA, 4, NA, NA, NA, …
$ gamma_topic_1         <dbl> 0.026098022, 0.026450828, 0.012260893, 0.0200424…
$ gamma_topic_2         <dbl> 0.014058480, 0.016014175, 0.006519458, 0.1322111…
$ gamma_topic_3         <dbl> 0.043655546, 0.066001729, 0.018185091, 0.5663056…
$ gamma_topic_4         <dbl> 0.03876696, 0.14976529, 0.76277514, 0.03011074, …
$ gamma_topic_5         <dbl> 0.186801763, 0.043801244, 0.020095565, 0.0373451…
$ gamma_topic_6         <dbl> 0.021470740, 0.024622665, 0.009041711, 0.0174135…
$ gamma_topic_7         <dbl> 0.036282513, 0.341246826, 0.017882159, 0.0289225…
$ gamma_topic_8         <dbl> 0.04538521, 0.14311198, 0.06168206, 0.03558740, …
$ gamma_topic_9         <dbl> 0.021910232, 0.023552979, 0.012143933, 0.0165138…
$ gamma_topic_10        <dbl> 0.020656194, 0.020843309, 0.013562820, 0.0161670…
$ gamma_topic_11        <dbl> 0.443542243, 0.027314995, 0.019249172, 0.0206369…
$ gamma_topic_12        <dbl> 0.044397591, 0.044680183, 0.019709728, 0.0343713…
$ gamma_topic_13        <dbl> 0.027631227, 0.037326873, 0.011473224, 0.0214522…
$ gamma_topic_14        <dbl> 0.029343282, 0.035266919, 0.015419046, 0.0229199…

Themen im Fokus

message_id	user_name	message_time	message_content	top_gamma	top_topic
ChwKGkNKdlRqY1BwdVlnREZRREV3Z1FkV2I4U1hn	David Davis	01:29:52	:watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon:	0.9628609	8
ChwKGkNLMlp3cUxzdVlnREZVN0NsQWtkT0JBRTN3	Jules Winnfeild 🏳️‍⚧️	01:42:09	:face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape:	0.9583042	8
ChwKGkNNclk0dFBZdVlnREZZYWg1UWNkUlhvNVB3	CanalEduge	00:14:24	:face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape:	0.9437816	8
ChwKGkNJaVdpNVBmdVlnREZTV1Q1UWNkUWg0dEJn	Jules Winnfeild 🏳️‍⚧️	00:43:27	:face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape::face-fuchsia-poop-shape:	0.9315330	8
ChwKGkNORG1uTWpvdVlnREZkd3VyUVlkSVFrVU5R	#BobbleHead	01:25:34	WORLDSTAR own's the Trademark on the Algorithm that identified all the Pedophiles = Blame T.M.Z. #ReleaseTheBlackBaby	0.9313871	8
ChwKGkNPV09fYmJwdVlnREZXc3ByUVlkbk9Vc3d3	David Davis	01:29:26	:watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon: :watermelon:	0.9303014	8
ChwKGkNMV244NkRvdVlnREZUMFRyUVlkYmZzUmpB	#BobbleHead	01:24:12	WORLDSTAR own's the Trademark on the Algorithm that identified all the Pedophiles = Blame T.M.Z.	0.9287878	8
ChwKGkNLN0N4ZTdldVlnREZWNDZyUVlkRmZJRklR	Corporations8MyBaby	00:42:10	:face-fuchsia-tongue-out::face-fuchsia-tongue-out::face-fuchsia-tongue-out::face-fuchsia-tongue-out:	0.9269656	8
9c014ab4-89a7-4f9d-97c8-be3da2868f58	nightbot	00:10:47	Join the official Majority Report discord community! https://discord.gg/majority	0.9215614	8
fcb53a8b-4b75-4557-b3eb-d273b7069d88	nightbot	00:26:14	Join the official Majority Report discord community! https://discord.gg/majority	0.9215614	8

Thema 12

chats_topics %>% 
  filter(top_topic == 12) %>% 
  arrange(-top_gamma) %>% 
  slice_head(n = 10) %>% 
  select(message_id, user_name, message_time, message_content, top_gamma, top_topic) %>% 
  gt() %>% 
  gtExtras::gt_theme_538() %>% 
  gt::tab_options(table.font.size = "10px")

message_id	user_name	message_time	message_content	top_gamma	top_topic
ChwKGkNNZXg5LUxxdVlnREZkcVc1UWNkeGpNTDJB	SamSedersLeftTeste	01:35:27	The vice president is BLACK BLACK BLACK BLACK BLACK BLACK	0.9227423	12
ChwKGkNNdVU0NTNndVlnREZkNEwxZ0FkbWxFSFN3	Rilly Kewl	00:48:18	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNJU1BtS0RndVlnREZRREV3Z1FkV2I4U1hn	Rilly Kewl	00:48:23	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNJS1NvcVRndVlnREZRMHUxZ0FkU1FFSzZB	Rilly Kewl	00:48:31	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNLSG9uYUxodVlnREZhY0cxZ0FkSVJjSGdB	Rilly Kewl	00:52:56	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNPbU90YV9odVlnREZWZ3FyUVlkaUpnNUpn	Rilly Kewl	00:53:23	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNLYmxuZjdzdVlnREZiMHUxZ0FkT0owN0h3	Rilly Kewl	01:45:21	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNLX3F4cl90dVlnREZWbzAxZ0FkdzVFTTR3	Rilly Kewl	01:47:38	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
ChwKGkNOYm1yTVB0dVlnREZWb0gxZ0FkQnF3QWRR	Rilly Kewl	01:47:46	Hey Hey Hey Hey SHUT UP:red_exclamation_mark:	0.9121621	12
bad4de96-6c3f-4495-9bd5-da395d9af90b	grandshadowfox	01:07:37	Grandshadowfox subscribed with Prime. They've subscribed for 16 months! 15 months	0.9064436	12

Thema 4

chats_topics %>% 
  filter(top_topic == 4) %>% 
  arrange(-top_gamma) %>% 
  slice_head(n = 10) %>% 
  select(message_id, user_name, message_time, message_content, top_gamma, top_topic) %>% 
  gt() %>% 
  gtExtras::gt_theme_538() %>% 
  gt::tab_options(table.font.size = "10px")

message_id	user_name	message_time	message_content	top_gamma	top_topic
ChwKGkNLX3Z3SzdkdVlnREZhb0NyUVlkVER3aVRn	rhys	00:35:28	:text-green-game-over::text-green-game-over::text-green-game-over::text-green-game-over::text-green-game-over::text-green-game-over:	0.9687024	4
ChwKGkNJN1hsSXJudVlnREZWTGNGZ2tkYnFnYmJB	Jules Winnfeild 🏳️‍⚧️	01:18:56	:fish-orange-wide-eyes::fish-orange-wide-eyes::fish-orange-wide-eyes::fish-orange-wide-eyes::fish-orange-wide-eyes:	0.9649579	4
ChwKGkNJQ3ZzYTdXdVlnREZSek1GZ2tkMndnZ1Bn	fish Monger	00:04:09	ideas:finger-red-number-one::finger-red-number-one::finger-red-number-one::finger-red-number-one::finger-red-number-one::finger-red-number-one::finger-red-number-one::finger-red-number-one:	0.9647715	4
ChwKGkNNcmVxb0RmdVlnREZhb0NyUVlkVER3aVRn	rhys	00:42:48	:text-green-game-over::text-green-game-over::text-green-game-over::text-green-game-over::text-green-game-over:	0.9630820	4
32d36382-5eaf-4da6-a2dc-c9683b98162b	nightbot	00:01:27	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4
4ffbae78-db39-40e9-bcf8-b5c0965fe2a4	nightbot	00:09:42	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4
a08570c3-f835-4568-9332-b97bf22ee61b	nightbot	02:01:22	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4
46b82320-e59d-486e-a58f-acf35b03fe4a	nightbot	02:09:43	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4
191d1514-cc7e-4a65-8c9e-0ce5d28f1a5d	nightbot	02:22:30	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4
ed759097-6071-4394-b810-5adafd52f652	nightbot	02:35:23	Libertarians, call into the show! 646 257-3920. Phones open after 1pm EST. Download the Majority Report app to IM into the show. Go to JoinTheMajorityReport.com to become a member and help support the show.	0.9629138	4

User mit den meisten Beiträgen zu Thema 4

chats_topics %>% 
  filter(top_topic == 8) %>% 
  count(user_name, sort = TRUE) %>% 
  mutate(
    prop = round(n/sum(n)*100, 2)) %>% 
  slice_head(n = 10) %>% 
  gt() %>% 
  gtExtras::gt_theme_538()

user_name	n	prop
buuuuuuuuuuuuuuuuuuuuuut	59	1.83
sauvignoncitizen	50	1.55
Say What	49	1.52
Jules Winnfeild 🏳️‍⚧️	47	1.45
asiak	46	1.42
hardradajm	40	1.24
Bob Carmody	34	1.05
T.R.	33	1.02
maj_k1bbles	31	0.96
ogdimwit	31	0.96

Prävalenz vs. Häufigkeit

top_gamma_k14 %>% 
  ggplot(aes(as.factor(topic), gamma)) +
  geom_col(fill = "#F57350") +
  labs(
    x = "Topic",
    y = "Mean gamma"
  ) +
  coord_flip() +
  scale_y_reverse() +
  scale_x_discrete(position = "top") +
  theme_pubr()

chats_topics %>% 
  mutate(across(top_topic, as.factor)) %>% 
  ggplot(aes(top_topic, y = after_stat(prop), group = 1)) +
  geom_bar(fill = "#1DA1F2") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "", 
    y = "Relative frequency"
  ) +
  coord_flip() +
  theme_pubr()

Einfluss von Meta-Variablen

effects <- estimateEffect(
  formula =~ platform + debate + message_during_debate,
  stmobj = tpm_k14, 
  metadata = chats_topics)

summary(effects, topics = 12)


Call:
estimateEffect(formula = ~platform + debate + message_during_debate, 
    stmobj = tpm_k14, metadata = chats_topics)


Topic 12:

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)   
(Intercept)             -0.128058  28.831457  -0.004  0.99646   
platformyoutube          0.214453  28.831531   0.007  0.99407   
debatevice presidential  0.203569  28.831506   0.007  0.99437   
message_during_debate    0.011889   0.004316   2.755  0.00588 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(effects, topics = 8)


Call:
estimateEffect(formula = ~platform + debate + message_during_debate, 
    stmobj = tpm_k14, metadata = chats_topics)


Topic 8:

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)
(Intercept)              0.1234719 29.5028653   0.004    0.997
platformyoutube         -0.0333377 29.5028958  -0.001    0.999
debatevice presidential -0.0134738 29.5028788   0.000    1.000
message_during_debate    0.0006216  0.0044960   0.138    0.890

Background

Preparation

Codechunks aus der Sitzung

Vorverarbeitung der Daten

Vorverarbeitung des Korpus

Add docvars

Fokus on The Majority Report

Estimate models

Vergleich des statistischen Fits

Hohe Kohärenz bei hoher Exklusivität

Extraktion der Beta- & Gamma-Matrix

Extraktion der Top Features nach Thema

Extraktion & Zusammenführung der Daten

Themen im Fokus

Top Topic

Thema 12

Thema 4

User mit den meisten Beiträgen zu Thema 4

Prävalenz vs. Häufigkeit

Einfluss von Meta-Variablen