# Load packages
# Runs the shared session setup script; its path is resolved relative to the
# project root via here::here().
# NOTE(review): the rest of this script calls here(), rename() and %>%
# unqualified, so the setup script is presumably what attaches those
# packages -- confirm.
source(
  file = here::here("data_collection/00_02-setup-session.R")
)

# Corpus: Transcripts ----
# Information
# Based on the transcript data, this script:
# - Creates a corpus, tokens, and a document-feature matrix with the
#   quanteda package (v4.1.0, Benoit et al. 2018).
# - Utilizes the udpipe (v0.8.11, Wijffels 2023) and spacyr (v1.3.0,
#   Benoit and Matsuo 2023) packages for additional linguistic processing,
#   adding lemmatization, part-of-speech tagging, and named entity
#   recognition.
#
# Preparation
# Read the serialized transcript object from local_data/.
# Later steps access its `hashed` and `correct` elements.
transcripts <- qs::qread(here("local_data/transcripts-debates_full.qs"))

# Process data ----
# Container collecting every derived representation of the transcripts
transcripts_corpora <- list()

# Corpus: one document per `id_sequence`, text taken from `dialogue`
transcripts_corpora$corp <- quanteda::corpus(
  transcripts$hashed,
  docid_field = "id_sequence",
  text_field = "dialogue"
)

# Tokens: drop punctuation, symbols, numbers and URLs, leave hyphenated
# words and tags unsplit, then remove English stopwords
transcripts_corpora$toks <- quanteda::tokens_remove(
  quanteda::tokens(
    transcripts_corpora$corp,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    split_hyphens = FALSE,
    split_tags = FALSE
  ),
  pattern = quanteda::stopwords("en")
)

# Document-feature matrix (DFM) built from the cleaned tokens
transcripts_corpora$dfm <- quanteda::dfm(transcripts_corpora$toks)

# Execute on first run, to download the model
# udmodel <- udpipe::udpipe_download_model(
#   language = "english",
#   model_dir = here("models"))

# Load the English udpipe model from the models/ directory
udmodel_english <- udpipe::udpipe_load_model(
  file = here("models/english-ewt-ud-2.5-191206.udpipe")
)

# Annotate the transcripts with udpipe; the id and text columns are renamed
# to `doc_id` and `text` before being handed to udpipe::udpipe().
# NOTE(review): this step reads transcripts$correct, whereas the quanteda
# corpus is built from transcripts$hashed -- confirm this is intended.
transcripts_corpora$udpipe <- udpipe::udpipe(
  rename(
    transcripts$correct,
    doc_id = id_sequence,
    text = dialogue
  ),
  udmodel_english
)

# Define environment
# Point reticulate at the "r-spacyr" virtualenv before spaCy is initialized
reticulate::use_virtualenv("r-spacyr")

# Initialize spaCy with the "en_core_web_sm" language model.
# Uncomment the next line to (re)download that model first:
# spacyr::spacy_download_langmodel("en_core_web_sm", force = TRUE)
spacyr::spacy_initialize("en_core_web_sm")

# Parse the quanteda corpus with spaCy, requesting fine-grained tags,
# coarse POS, lemmas, named entities, dependency relations and noun phrases,
# plus the extra token attribute `is_punct`.
# NOTE(review): the spaCy session is never closed with
# spacyr::spacy_finalize(); consider calling it once parsing is done.
transcripts_corpora$spacyr <- transcripts_corpora$corp %>%
  spacyr::spacy_parse(
    tag = TRUE,
    pos = TRUE,
    lemma = TRUE,
    entity = TRUE,
    dependency = TRUE,
    nounphrase = TRUE,
    multithread = TRUE,
    additional_attributes = c("is_punct")
  )

# Save data ----
# Save complete data (the whole list, including every element added above)
qs::qsave(
  transcripts_corpora,
  file = here("local_data/transcripts-corpora_full.qs")
)

# Save udpipe corpus on its own
qs::qsave(
  transcripts_corpora$udpipe,
  file = here("local_data/transcripts-corpus_udpipe.qs")
)

# Save spacyr corpus on its own
qs::qsave(
  transcripts_corpora$spacyr,
  file = here("local_data/transcripts-corpus_spacyr.qs")
)

# References ----
# Benoit, Kenneth, and Akitaka Matsuo. 2023. Spacyr: Wrapper to the 'spaCy' 'NLP' Library. https://spacyr.quanteda.io.
# Benoit, Kenneth, Kohei Watanabe, Haiyan Wang, Paul Nulty, Adam Obeng, Stefan Müller, and Akitaka Matsuo. 2018. "Quanteda: An R Package for the Quantitative Analysis of Textual Data." Journal of Open Source Software 3 (30): 774. https://doi.org/10.21105/joss.00774.
# Wijffels, Jan. 2023. Udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit. https://CRAN.R-project.org/package=udpipe.