if (!require("pacman")) install.packages("pacman")
pacman::p_load(
here,
magrittr, janitor,
ggpubr,
gt, gtExtras,
countdown,
quanteda, # quanteda text processing
quanteda.textplots, quanteda.textstats, quanteda.textmodels,
udpipe, spacyr, # POS tagging
easystats, tidyverse
)
🔨 Advanced Methods
Tutorial - Session 08
Background
Preparation
# Import base data
chats <- qs::qread(here("local_data/chat-debates_full.qs"))$correct
transcripts <- qs::qread(here("local_data/transcripts-debates_full.qs"))$correct
dict_chat_emotes <- readRDS(here("local_data/dictionary_chat_emotes.RDS"))
# Import corpora
transcripts_udpipe <- qs::qread(here("local_data/transcripts-corpus_udpipe.qs"))
transcripts_spacyr <- qs::qread(here("local_data/transcripts-corpus_spacyr.qs"))
transcripts_pos <- transcripts_udpipe
Code chunks from the session
Creating the data basis
# Create corpus
corp_transcripts <- transcripts %>%
quanteda::corpus(
docid_field = "id_sequence",
text_field = "dialogue"
)
# Tokenize corpus
toks_transcripts <- corp_transcripts %>%
quanteda::tokens(
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
split_hyphens = FALSE,
split_tags = FALSE
) %>%
quanteda::tokens_remove(
pattern = quanteda::stopwords("en")
)
# Add n_grams
toks_transcripts_ngrams <- toks_transcripts %>%
quanteda::tokens_ngrams(n = 1:3)
# Convert to DFM
dfm_transcripts <- toks_transcripts_ngrams %>%
quanteda::dfm()
# Create corpus
corp_chats <- chats %>%
quanteda::corpus(docid_field = "message_id", text_field = "message_content")
# Tokenize corpus
toks_chats <- corp_chats %>% quanteda::tokens()
# Convert to DFM
dfm_chats <- toks_chats %>% quanteda::dfm()
Ngrams: sequences of N consecutive tokens
toks_transcripts %>%
quanteda::tokens_ngrams(n = 2) %>%
quanteda::dfm() %>%
quanteda.textstats::textstat_frequency() %>%
head(25)
feature frequency rank docfreq group
1 know_know 1337 1 49 all
2 t-mobile_t-mobile 864 2 6 all
3 donald_trump 755 3 461 all
4 going_say 666 4 30 all
5 say_going 661 5 35 all
6 saying_bad 558 6 4 all
7 bad_saying 553 7 3 all
8 kamala_harris 494 8 333 all
9 vice_president 429 9 376 all
10 curious_curious 373 10 7 all
11 sekunden_pause 354 11 266 all
12 right_now 269 12 234 all
13 united_states 268 13 211 all
14 feel_like 230 14 164 all
15 oh_oh 229 15 10 all
16 like_know 208 16 133 all
17 president_trump 203 17 178 all
18 like_like 191 18 144 all
19 president_harris 186 19 179 all
20 lot_people 181 20 138 all
21 know_like 168 21 109 all
22 american_people 163 22 118 all
23 oh_god 154 23 139 all
24 just_like 153 24 129 all
25 former_president 141 25 115 all
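What tokens_ngrams() produces becomes clearer with a minimal sketch on a made-up one-sentence text (not part of the session data): neighbouring tokens are concatenated with "_" by default.
# Minimal sketch with a hypothetical mini-text (not part of the session data)
toks_mini <- quanteda::tokens("the quick brown fox")
quanteda::tokens_ngrams(toks_mini, n = 2)
# yields the bigrams "the_quick", "quick_brown", "brown_fox"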
Collocations: identifying meaningful word combinations
toks_transcripts %>%
quanteda.textstats::textstat_collocations(
size = 2,
min_count = 5
) %>%
head(25)
collocation count count_nested length lambda z
1 know know 1337 0 2 3.890787 98.31370
2 saying bad 558 0 2 6.503305 81.19215
3 bad saying 553 0 2 6.481795 80.98625
4 going say 666 0 2 4.555737 80.92246
5 say going 661 0 2 4.562963 80.82524
6 donald trump 755 0 2 7.422847 75.19267
7 kamala harris 494 0 2 7.873933 68.88003
8 vice president 429 0 2 7.258104 56.72753
9 oh oh 229 0 2 4.679549 54.95066
10 right now 269 0 2 3.996328 53.27167
11 senator vance 129 0 2 6.583164 49.48173
12 little bit 132 0 2 8.268099 45.80914
13 president harris 186 0 2 4.027681 45.69845
14 oh god 154 0 2 6.175930 44.94423
15 years ago 102 0 2 6.436807 43.06424
16 tim walz 90 0 2 7.588398 43.05596
17 president trump 203 0 2 3.468589 42.70406
18 four years 100 0 2 6.381600 42.53221
19 health care 136 0 2 7.400206 41.83123
20 white house 85 0 2 8.071701 41.52827
21 donald trump's 132 0 2 6.408658 41.05938
22 former president 141 0 2 5.790480 41.04486
23 curious curious 373 0 2 11.836611 40.77202
24 governor walz 77 0 2 6.512020 39.65499
25 two minutes 84 0 2 6.490910 39.39074
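A common follow-up step (sketched here, not shown in the session): collocations with high z-values can be compounded into single tokens, since tokens_compound() accepts the collocations object as a pattern. The z > 10 cutoff below is an arbitrary illustrative threshold.
# Sketch: compound statistically strong collocations into single tokens
# (z > 10 is an arbitrary example threshold)
colls <- toks_transcripts %>%
  quanteda.textstats::textstat_collocations(size = 2, min_count = 5)
toks_compounded <- toks_transcripts %>%
  quanteda::tokens_compound(pattern = colls[colls$z > 10, ])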
Working with quanteda: corpus
# Create corpus
corp_transcripts <- transcripts %>%
quanteda::corpus(
docid_field = "id_sequence",
text_field = "dialogue"
)
# Output
corp_transcripts
Corpus consisting of 5,861 documents and 10 docvars.
p1_s0001 :
"Tonight, the high-stakes showdown here in Philadelphia betwe..."
p1_s0002 :
"A historic race for president upended just weeks ago, Presid..."
p1_s0003 :
"The candidates separated by the smallest of margins, essenti..."
p1_s0004 :
"This is an ABC News special. The most consequential moment o..."
p1_s0005 :
"Together, we'll chart a... (..)"
p1_s0006 :
"Donald Trump."
[ reached max_ndoc ... 5,855 more documents ]
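A few accessor functions help to inspect the corpus object; a short sketch, assuming corp_transcripts from above:
# Sketch: inspecting the corpus object
quanteda::ndoc(corp_transcripts)                # number of documents
quanteda::docvars(corp_transcripts) %>% head()  # attached document-level variables
summary(corp_transcripts, n = 3)                # types, tokens and sentences per document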
Keywords-in-Context (KWIC)
Immediate word context without statistical weighting
toks_transcripts %>%
kwic("know", window = 3) %>%
head(10)
Keyword-in-context with 10 matches.
[p1_s0018, 29] opportunity economy thing | know | shortage homes housing
[p1_s0018, 39] far many people | know | young families need
[p1_s0020, 25] billions billions dollars | know | China fact never
[p1_s0022, 44] done intend build | know | aspirations hopes American
[p1_s0024, 2] nothing | know | knows better anyone
[p1_s0025, 1] | know | everybody else Vice
[p1_s0026, 64] stand issues invite | know | Donald Trump actually
[p1_s0028, 38] goods coming country | know | many economists say
[p1_s0029, 24] billions dollars countries | know | like gone immediately
[p1_s0031, 90] Thank President Xi | know | Xi responsible lacking
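kwic() also accepts glob or regex patterns, which helps when matching inflected or related forms; an illustrative variation (not part of the session code):
# Sketch: glob pattern matching all tokens starting with "econom"
toks_transcripts %>%
  quanteda::kwic(pattern = "econom*", valuetype = "glob", window = 3) %>%
  head(10)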
Using KWIC for quality control
toks_transcripts %>%
kwic(
phrase("know know"),
window = 3) %>%
tibble::as_tibble() %>%
select(-pattern) %>%
slice(35:45) %>%
gt() %>%
gtExtras::gt_theme_538() %>%
gt::tab_options(
table.width = gt::pct(100),
table.font.size = "10px"
)
| docname | from | to | pre | keyword | post |
|---|---|---|---|---|---|
| vp2_s0723 | 94 | 95 | kiss just kiss | know know | just kiss kiss |
| vp2_s0732 | 119 | 120 | default press even | know know | difference campaign strategy |
| vp3_s0151 | 32 | 33 | cop able assess | know know | J.D Vance lying |
| vp3_s0332 | 3 | 4 | really mean | know know | many people tune |
| vp3_s0332 | 116 | 117 | generous Sekunden Pause | know know | type like know |
| vp3_s0332 | 120 | 121 | know type like | know know | type like know |
| vp3_s0332 | 124 | 125 | know type like | know know | know know know |
| vp3_s0332 | 125 | 126 | type like know | know know | know know know |
| vp3_s0332 | 126 | 127 | like know know | know know | know know know |
| vp3_s0332 | 127 | 128 | know know know | know know | know know know |
| vp3_s0332 | 128 | 129 | know know know | know know | know know know |
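The repeated "know know" hits in vp3_s0332 point to a transcription artifact; as a quality-control follow-up, the full text of a flagged sequence can be pulled for manual inspection (a sketch; the docname is taken from the table above):
# Sketch: retrieve the full text of a flagged sequence for manual inspection
as.character(corp_transcripts)["vp3_s0332"]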
Defining ngrams as features
# Definition von Features
custom_ngrams <- c("donald trump", "joe biden", "kamala harris")
# Anwendung auf DFM
dfm_with_custom_ngrams <- toks_transcripts %>%
tokens_compound(pattern = phrase(custom_ngrams)) %>%
dfm() %>%
dfm_trim(min_docfreq = 0.005, max_docfreq = 0.99, docfreq_type = "prop")
# Überprüfung
dfm_with_custom_ngrams %>%
convert(to = "data.frame") %>%
select(doc_id, starts_with("donald")) %>%
head()
doc_id donald_trump donald
1 p1_s0001 1 0
2 p1_s0002 1 0
3 p1_s0003 0 0
4 p1_s0004 0 0
5 p1_s0005 0 0
6 p1_s0006 1 0
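To gauge how strongly the dfm_trim() thresholds shrink the feature space, a sketch comparing the feature count before and after trimming (assumes the objects created above):
# Sketch: feature count before vs. after dfm_trim()
dfm_untrimmed <- toks_transcripts %>%
  quanteda::tokens_compound(pattern = quanteda::phrase(custom_ngrams)) %>%
  quanteda::dfm()
quanteda::nfeat(dfm_untrimmed)          # all features
quanteda::nfeat(dfm_with_custom_ngrams) # features kept after trimming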
Semantic networks: visualizing token relationships
# Lookup emotes in DFM of chats
dfm_emotes <- dfm_chats %>%
quanteda::dfm_lookup(
dictionary = dict_chat_emotes)
# Extract the names of the 50 most frequent emotes
top50_emotes <- dfm_emotes %>%
topfeatures(50) %>%
names()
# Visualize
dfm_emotes %>%
fcm() %>%
fcm_select(pattern = top50_emotes) %>%
textplot_network()
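The fcm() above counts co-occurrence within whole chat messages. As a variation (a sketch, not part of the session code), co-occurrence can be restricted to a token window, which requires looking up the emotes on the tokens object instead of the DFM:
# Sketch: window-based co-occurrence of emotes (assumed alternative to the
# message-level fcm above)
toks_chats %>%
  quanteda::tokens_lookup(dictionary = dict_chat_emotes, exclusive = TRUE) %>%
  quanteda::fcm(context = "window", window = 5) %>%
  quanteda::fcm_select(pattern = top50_emotes) %>%
  quanteda.textplots::textplot_network()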
POS-Tagging & Dependency Parsing
udmodel <- udpipe::udpipe_download_model(language = "english")
transcripts_pos <- transcripts %>%
rename(doc_id = id_sequence, text = dialogue) %>%
udpipe::udpipe(udmodel)
transcripts_pos %>%
select(doc_id, sentence_id, token_id, token, head_token_id, lemma, upos, xpos) %>%
head(n = 7) %>%
gt() %>% gtExtras::gt_theme_538() %>%
gt::tab_options(table.width = gt::pct(100), table.font.size = "12px")
| doc_id | sentence_id | token_id | token | head_token_id | lemma | upos | xpos |
|---|---|---|---|---|---|---|---|
| p1_s0001 | 1 | 1 | Tonight | 0 | tonight | NOUN | NN |
| p1_s0001 | 1 | 2 | , | 1 | , | PUNCT | , |
| p1_s0001 | 1 | 3 | the | 7 | the | DET | DT |
| p1_s0001 | 1 | 4 | high | 6 | high | ADJ | JJ |
| p1_s0001 | 1 | 5 | - | 6 | - | PUNCT | HYPH |
| p1_s0001 | 1 | 6 | stakes | 7 | stake | NOUN | NNS |
| p1_s0001 | 1 | 7 | showdown | 1 | showdown | NOUN | NN |
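transcripts_spacyr is loaded from disk in the preparation step above; a sketch of how such an annotation could be produced with spacyr, assuming a local spaCy installation with an English model (e.g. en_core_web_sm):
# Sketch: spacyr-based annotation (assumes spaCy and an English model are
# installed locally; not part of the session code shown here)
spacyr::spacy_initialize(model = "en_core_web_sm")
transcripts_spacyr <- transcripts %>%
  rename(doc_id = id_sequence, text = dialogue) %>%
  spacyr::spacy_parse(dependency = TRUE, lemma = TRUE, pos = TRUE)
spacyr::spacy_finalize()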
Which words are used to describe Trump?
transcripts_pos %>%
filter(
upos == "NOUN" &
lemma == "trump") %>%
inner_join(
transcripts_pos,
by = c(
"doc_id",
"sentence_id"),
relationship =
"many-to-many") %>%
filter(
upos.y == "ADJ" &
head_token_id.y == token_id.x) %>%
rename(
token_id = token_id.y,
token = token.y) %>%
select(
doc_id, sentence_id,
token_id, token) %>%
sjmisc::frq(token, sort.frq = "desc")
token <character>
# total N=161 valid N=161 mean=3.72 sd=4.67
Value | N | Raw % | Valid % | Cum. %
---------------------------------------------
donald | 132 | 81.99 | 81.99 | 81.99
Donald | 4 | 2.48 | 2.48 | 84.47
um | 3 | 1.86 | 1.86 | 86.34
former | 2 | 1.24 | 1.24 | 87.58
narcissistic | 2 | 1.24 | 1.24 | 88.82
bad | 1 | 0.62 | 0.62 | 89.44
good | 1 | 0.62 | 0.62 | 90.06
great | 1 | 0.62 | 0.62 | 90.68
iran | 1 | 0.62 | 0.62 | 91.30
laura | 1 | 0.62 | 0.62 | 91.93
much | 1 | 0.62 | 0.62 | 92.55
okay | 1 | 0.62 | 0.62 | 93.17
other | 1 | 0.62 | 0.62 | 93.79
past | 1 | 0.62 | 0.62 | 94.41
Said | 1 | 0.62 | 0.62 | 95.03
selfish | 1 | 0.62 | 0.62 | 95.65
social | 1 | 0.62 | 0.62 | 96.27
tighter | 1 | 0.62 | 0.62 | 96.89
total | 1 | 0.62 | 0.62 | 97.52
unfit | 1 | 0.62 | 0.62 | 98.14
unseat | 1 | 0.62 | 0.62 | 98.76
weaker | 1 | 0.62 | 0.62 | 99.38
weird | 1 | 0.62 | 0.62 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
transcripts_spacyr %>%
filter(
pos == "NOUN" &
lemma == "trump") %>%
inner_join(
transcripts_spacyr,
by = c(
"doc_id",
"sentence_id"),
relationship =
"many-to-many") %>%
filter(
pos.y == "ADJ" &
head_token_id.y == token_id.x) %>%
rename(
token_id = token_id.y,
token = token.y) %>%
select(
doc_id, sentence_id,
token_id, token) %>%
sjmisc::frq(token, sort.frq = "desc")
token <character>
# total N=10 valid N=10 mean=5.40 sd=2.88
Value | N | Raw % | Valid % | Cum. %
-------------------------------------------
unfit | 2 | 20 | 20 | 20
bad | 1 | 10 | 10 | 30
donald | 1 | 10 | 10 | 40
fucking | 1 | 10 | 10 | 50
narcissistic | 1 | 10 | 10 | 60
other | 1 | 10 | 10 | 70
same | 1 | 10 | 10 | 80
tighter | 1 | 10 | 10 | 90
total | 1 | 10 | 10 | 100
<NA> | 0 | 0 | <NA> | <NA>