Processing: Transcripts

Information
  • Processes transcripts of live-streamed debates.
  • Imports, cleans, and transforms the data.
  • Saves the processed data for further analysis.

Preparation

# Load packages
source(file = here::here(
  "data_collection/00_02-setup-session.R"
))
# Collect every transcript text file into `transcripts$raw`: one list element
# per file, read with `read_file()`.
transcripts <- list(
    raw = set_names(
        map(
            fs::dir_ls(
                path = here("local_data/transcripts"),
                glob = "*.txt"
            ),
            read_file
        ),
        # Rename each element to its base file name (no directory) with the
        # ".txt" suffix stripped
        function(paths) str_extract(basename(paths), "^(.*)(?=\\.txt)")
    )
)

Process data

Changelog
  • Added extraction of speaker and timestamp from each line of the transcript.
  • Removed brackets from the timestamp.
  • Extracted dialogue text and calculated its length.
  • Filtered out lines without speaker or dialogue.
  • Converted timestamp to hms object and calculated duration between dialogues.
  • Merged all processed files into a single data frame with source identifiers.
  • Added columns for debate type, streamer, and numeric streamer identifier.
  • Created unique speaker identifiers combining debate type, streamer, and speaker.
  • Added sequence_during_debate indicator based on timestamp ranges for each source.
  • Generated unique id_sequence for each speaking sequence.
  • Removed unnecessary columns (line, prefix) and reordered columns.
# Build `transcripts$correct`: one row per transcript line that carries a
# speaker tag, with parsed timestamp, dialogue text, per-source duration, and
# derived debate / streamer / speaker / sequence identifiers.
transcripts$correct <- transcripts$raw %>%
    # Parse each transcript separately so `lead()` never crosses file borders.
    # `transcripts$raw` already holds the file contents, so the text is split
    # directly rather than being passed through `read_file()` a second time.
    map(function(text) {
        tibble(line = str_split(text, "\n")[[1]]) %>%
            mutate(
                # Speaker tag, e.g. "S27"
                speaker = str_extract(line, "S\\d+"),
                # Timestamp "[HH:MM:SS]" with the surrounding brackets removed
                timestamp = str_remove_all(
                    str_extract(line, "\\[\\d{2}:\\d{2}:\\d{2}\\]"),
                    "[\\[\\]]"
                ),
                # Everything except the leading "S<n> [HH:MM:SS]: " prefix
                dialogue = str_remove(
                    line,
                    "S\\d+ \\[\\d{2}:\\d{2}:\\d{2}\\]: "
                ),
                dialogue_length = nchar(dialogue)
            ) %>%
            # Keep only lines that have both a speaker and dialogue text
            filter(!is.na(speaker) & !is.na(dialogue)) %>%
            mutate(
                timestamp = hms::as_hms(timestamp),
                # Seconds until the next line's timestamp; NA for the last
                # line of each transcript
                duration = as.numeric(
                    difftime(lead(timestamp), timestamp, units = "secs")
                )
            )
    }) %>%
    # Stack all transcripts; the list names become the `source` column
    bind_rows(.id = "source") %>%
    mutate(
        debate = case_when(
            str_detect(source, "vice_presidential") ~ "vice_presidential",
            TRUE ~ "presidential"
        ),
        streamer = case_when(
            str_detect(source, "abc") | str_detect(source, "cbs") ~ "tv_station",
            str_detect(source, "hasanabi") ~ "hasanabi",
            str_detect(source, "zackrawrr") ~ "zackrawrr",
            str_detect(source, "the_majority_report") ~ "the_majority_report",
            TRUE ~ "unknown"
        ),
        id_streamer = case_when(
            streamer == "tv_station" ~ 1,
            streamer == "hasanabi" ~ 2,
            streamer == "the_majority_report" ~ 3,
            streamer == "zackrawrr" ~ 4,
            TRUE ~ 0
        ),
        # Helper prefix such as "p1_" or "vp4_"; dropped again at the end
        prefix = paste0(
            ifelse(debate == "presidential", "p", "vp"),
            id_streamer, "_"
        ),
        id_speaker = paste0(prefix, tolower(speaker)),
        # 1 when the line falls inside the source-specific time window,
        # 0 otherwise (including unknown sources and missing timestamps).
        # Each source/window pair is parenthesised to make precedence explicit.
        sequence_during_debate = case_when(
            # Presidential debate
            (source == "presidential_debate-abc" &
                timestamp >= hms::as_hms("00:00:00") &
                timestamp <= hms::as_hms("01:45:07")) |
            (source == "presidential_debate-hasanabi" &
                timestamp >= hms::as_hms("07:00:11") &
                timestamp <= hms::as_hms("08:45:21")) |
            (source == "presidential_debate-zackrawrr" &
                timestamp >= hms::as_hms("08:02:12") &
                timestamp <= hms::as_hms("09:46:15")) |
            (source == "presidential_debate-the_majority_report" &
                timestamp >= hms::as_hms("00:12:53") &
                timestamp <= hms::as_hms("01:57:49")) |
            # Vice-Presidential debate
            (source == "vice_presidential_debate-cbs" &
                timestamp >= hms::as_hms("00:00:00") &
                timestamp <= hms::as_hms("01:47:48")) |
            (source == "vice_presidential_debate-hasanabi" &
                timestamp >= hms::as_hms("06:57:00") &
                timestamp <= hms::as_hms("08:43:17")) |
            # Fixed: this window previously tested the presidential zackrawrr
            # source, so the vice-presidential stream was never flagged.
            (source == "vice_presidential_debate-zackrawrr" &
                timestamp >= hms::as_hms("07:19:26") &
                timestamp <= hms::as_hms("09:05:41")) |
            (source == "vice_presidential_debate-the_majority_report" &
                timestamp >= hms::as_hms("00:09:52") &
                timestamp <= hms::as_hms("01:57:07")) ~ 1,
            TRUE ~ 0
        )
    ) %>%
    # Number the rows within each debate/streamer combination
    group_by(prefix, id_streamer) %>%
    mutate(id_sequence = paste0(prefix, "s", sprintf("%04d", row_number()))) %>%
    ungroup() %>%
    relocate(id_sequence) %>%
    select(-line, -prefix)
Variable Description
id_sequence Unique identifier for each speaking sequence
source Source of the transcript (e.g., presidential_debate-abc)
speaker Identifier for the speaker (e.g., S27)
timestamp Timestamp of the dialogue in HH:MM:SS format
dialogue Text of the dialogue
dialogue_length Length of the dialogue text in characters
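duration Duration of the dialogue in seconds, measured as the time until the next line's timestamp within the same source (NA for the last line of each source)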
debate Type of debate (e.g., presidential, vice_presidential)
streamer Source of the stream (e.g., tv_station, hasanabi)
id_streamer Numeric identifier for the streamer
id_speaker Unique identifier for the speaker, combining debate type, streamer, and speaker
sequence_during_debate Indicator if the sequence occurred during the debate (1 = yes, 0 = no)

Save output

# Persist the complete `transcripts` list
qs::qsave(
    x = transcripts,
    file = here("local_data/transcripts-debates_full.qs")
)

# Persist only the processed data frame stored in `transcripts$correct`
qs::qsave(
    x = transcripts$correct,
    file = here("local_data/transcripts-debates.qs")
)