Corpus: Chats

Information

Based on the chat data, this script:

- Creates a corpus, tokens, and a document-feature matrix with the quanteda package [v4.1.0, @benoit2018].
- Uses the udpipe [v0.8.11, @wijffels2023] and spacyr [v1.3.0, @benoit2023] packages for additional linguistic processing: lemmatization, part-of-speech tagging, and named entity recognition.

Preparation

# Load packages
source(file = here::here(
  "data_collection/00_02-setup-session.R"
))

# Raw chat data; the corpus further down is built from `chat$hashed`
chat <- qs::qread(here("local_data/chat-debates_full.qs"))

# The saved corpora file is written by the save step at the end of this
# script, so it does not exist on a first run. Load it only when present
# instead of aborting with a read error.
if (file.exists(here("local_data/chat-corpora_full.qs"))) {
  chat_corpora <- qs::qread(here("local_data/chat-corpora_full.qs"))
}

Process data

# Container for every corpus-derived object produced below
chat_corpora <- list()

# Corpus: one document per message, identified by its message id
chat_corpora$corp <- quanteda::corpus(
  chat$hashed,
  docid_field = "message_id",
  text_field = "message_content"
)

# Tokens derived from the corpus
chat_corpora$toks <- quanteda::tokens(chat_corpora$corp)

# Document-feature matrix (DFM) derived from the tokens
chat_corpora$dfm <- quanteda::dfm(chat_corpora$toks)

# One-time setup: uncomment on the first run to download the model
# udmodel <- udpipe::udpipe_download_model(
#     language = "english",
#     model_dir = here("models"))

# Read the English udpipe model from disk
udmodel_english <- udpipe::udpipe_load_model(
  file = here("models/english-ewt-ud-2.5-191206.udpipe")
)

# Annotate the chat messages; the id and text columns are renamed to
# `doc_id` and `text` before the data is handed to udpipe
chat_corpora$udpipe <- udpipe::udpipe(
  rename(
    chat$hashed,
    doc_id = message_id,
    text = message_content
  ),
  udmodel_english
)

# Select the "r-spacyr" virtual environment for reticulate
reticulate::use_virtualenv("r-spacyr")

# Download the "en_core_web_sm" language model, then initialize spaCy with it
spacyr::spacy_download_langmodel("en_core_web_sm")
spacyr::spacy_initialize("en_core_web_sm")

# Parse the quanteda corpus with spaCy, requesting tags, parts of speech,
# lemmas and dependency information
chat_corpora$spacyr <- spacyr::spacy_parse(
  chat_corpora$corp,
  tag = TRUE,
  pos = TRUE,
  lemma = TRUE,
  dependency = TRUE,
  multithread = TRUE
)

Save data

# Full list of corpus objects
qs::qsave(chat_corpora, file = here("local_data/chat-corpora_full.qs"))

# udpipe annotations on their own
qs::qsave(
  chat_corpora[["udpipe"]],
  file = here("local_data/chat-corpus_udpipe.qs")
)

# spacyr annotations on their own
qs::qsave(
  chat_corpora[["spacyr"]],
  file = here("local_data/chat-corpus_spacyr.qs")
)