# Setup R environment
pacman::p_load(
here, fs,
twitchr,
tidyverse,
reticulate
)

Mining: Vice-Presidential Debate
Vance vs. Walz - 02.10.2024
Information
This document outlines the process of collecting live chat data from Twitch VODs of the Vice-Presidential debate held on 02.10.2024. The steps taken include:
- Setting up the R and Python environments, including installing necessary packages.
- Authenticating with Twitch using the `twitchr` package.
- Preparing a list of VOD URLs to be processed.
- Downloading the VODs using the `twitch-dl` tool.
- Collecting live chat messages from the VODs using the `chat_downloader` Python package.
- Converting the collected chat data into a pandas DataFrame and then into an R tibble.
- Exporting the collected chat data to a local file for further analysis.
Preparation
# Setup Python environment
import datetime
import email, smtplib, ssl
import pandas as pd
import rpy2.robjects as robjects
import twitchdl
from chat_downloader import ChatDownloader
from chat_downloader.sites import TwitchChatDownloader

# twitch authorization
twitchr::twitch_auth()

Prepare list of VODs
# Twitch VOD URLs of channels that live-streamed the vice-presidential
# debate; each entry is a full chat-replay-capable VOD link.
debate_vods_urls <- c(
  # Twitch
  "https://www.twitch.tv/videos/2265091277", # hasanabi
  "https://www.twitch.tv/videos/2265091311", # zackrawrr
  "https://www.twitch.tv/videos/2265413840"  # Majority Report Live
)

## Prepare the download links
# Download each debate VOD locally with the twitch-dl CLI.
# NOTE(review): quality flags differ per channel (720p30 vs 720p60) —
# presumably matching the renditions each channel actually offers; confirm
# with `twitch-dl info <url>` before re-running.
# HasanAbi
twitch-dl download https://www.twitch.tv/videos/2265091277 --quality 720p30
# zackrawrr
twitch-dl download https://www.twitch.tv/videos/2265091311 --quality 720p60
# The Majority Report
twitch-dl download https://www.twitch.tv/videos/2265413840 --quality 720p30
twitch-dl download https://www.twitch.tv/videos/2265413840 --quality 720p30

Collect live chat
# Pull the VOD URL vector defined in the R session into Python via rpy2;
# robjects.globalenv exposes R's global environment as a mapping.
url_py = list(robjects.globalenv['debate_vods_urls'])
# Initialize the ChatDownloader client used to fetch Twitch chat replays.
chat_downloader = ChatDownloader()
# Accumulator: one dict per chat message, later converted to a DataFrame.
message_list = []
def generate_stream_id(url, index):
    """Return a unique, 1-based stream identifier for the VOD at `index`.

    The `url` argument is accepted so callers could switch to URL-derived
    IDs later, but the current ID depends only on list position.
    """
    stream_number = index + 1
    return "stream_{}".format(stream_number)
# Debugging: show which URLs will be processed before starting.
print("URLs to process:", url_py)


def _message_row(message, stream_id, url):
    """Flatten one chat-downloader message dict into a single flat row.

    Missing author fields default to the string 'NA' (not None), matching
    the original extraction; badges/emotes are kept as lists so the R side
    receives list-columns.
    """
    author_info = message.get('author', {})  # author sub-dict may be absent
    author_name = author_info.get('name', 'NA')
    return {
        'stream_id': stream_id,  # unique per-VOD identifier
        'url': url,              # source VOD URL
        'username': author_name,
        'user_id': author_info.get('id', 'NA'),
        # Fall back to the login name when no display name is set.
        'display_name': author_info.get('display_name', author_name),
        'user_type': author_info.get('type', 'NA'),
        'user_gender': author_info.get('gender', 'NA'),
        'user_is_bot': author_info.get('is_bot', 'NA'),
        'user_is_original_poster': author_info.get('is_original_poster', 'NA'),
        'user_is_verified': author_info.get('is_verified', 'NA'),
        'user_is_moderator': author_info.get('is_moderator', 'NA'),
        'user_is_subscriber': author_info.get('is_subscriber', 'NA'),
        'badges': author_info.get('badges', []),  # list of badge dicts
        'emotes': message.get('emotes', []),      # list of emote dicts
        'timestamp': message.get('time_in_seconds', None),
        'message_id': message.get('message_id', None),
        'message_type': message.get('message_type', 'None'),
        'message_content': message.get('message', ''),
    }


# Loop through each URL, download the chat replay, and collect flat rows.
# A failure on one VOD is logged and does not abort the remaining VODs.
for idx, url in enumerate(url_py):
    try:
        print(f"Processing URL: {url}")
        # Fetch chat
        chat = chat_downloader.get_chat(url)
        if not chat:
            print(f"No chat data found for {url}")
            continue  # Skip to the next URL if no chat found
        stream_id = generate_stream_id(url, idx)  # Generate a unique stream ID
        print(f"Downloading chat for {url}")
        for message in chat:
            message_list.append(_message_row(message, stream_id, url))
    except Exception as e:
        print(f"Error processing {url}: {e}")

# Print the final list of messages collected
print("Collection finished")

# Convert the list of dictionaries to a pandas DataFrame
# (read from R afterwards via reticulate's `py$message_df`).
message_df = pd.DataFrame(message_list)
# Pull the pandas DataFrame built in Python back into R (via reticulate's
# `py` object) and convert it to a tibble for tidyverse-friendly handling.
df <- as_tibble(py$message_df)

# Check the structure of the tibble
glimpse(df)

# Name of subdirectory for easier path management
project_dir <- here::here("2024-nlp_of_live_stream_chat")

# Persist the raw chat data locally for downstream analysis.
qs::qsave(
  df,
  file = here(project_dir, "local_data/chat_raw-vods_vice_presidential_debate.qs")
)