Showcase

Focus on Twitter API v2

Open this showcase in other interactive and executable environments:

Binder RStudio Binder Google Colab

Background

Practical application of the Twitter Academic Research Product Track v2 API endpoint with the help of the academictwitteR (Barrie & Ho, 2021) package. Visit the repository of the package for further information.

This version of the Twitter API allows researchers to access larger volumes of Twitter data. For more information on the Twitter API, including how to apply for access to the Academic Research Product Track, see the Twitter Developer platform.

Disclaimer

This session was created before Twitter announced changes to its API access:

As things stand (28.02.2023), there will no longer be free API access for academic research.

This showcase contains two example uses of the API: the analysis of specific hashtags (e.g. #Karneval) and the analysis of specific accounts (e.g. @elonmusk).

Preparation

Load necessary packages

library(here) # Easy file path construction
library(academictwitteR) # Collecting the data
library(lubridate) # Work with date-times and time-spans
library(sjmisc) # Collection of miscellaneous utility functions
library(tidyverse) # Preparation of the data
library(quanteda) # Text mining
library(quanteda.textstats) # Text statistics
library(quanteda.textplots) # Visualisation of text data
library(ggthemes) # Custom ggplot themes
library(ggpubr) # Convenience functions for 'ggplot2'-plots

Set personal bearer token

# Bearer token used to authenticate against the Twitter API.
# Prefer the TWITTER_BEARER environment variable (e.g. set in .Renviron) so
# the secret never has to be written into this script; if it is not set,
# fall back to the placeholder, which must then be replaced by hand.
personal_bearer_token <- Sys.getenv("TWITTER_BEARER", unset = "")
if (!nzchar(personal_bearer_token)) {
  personal_bearer_token <- "INSERT BEARER TOKEN HERE"
}

Mining tweets: hashtag(s)

Data collection

# Collect the tweets matching the hashtag query "#Karneval" that were
# created between 2022-11-11 00:00 and 2022-11-13 12:00 (UTC, "Z" suffix).
# - data_path is built with here() so the raw files are written to the same
#   folder that bind_tweets() reads from afterwards, independent of the
#   current working directory.
# - bearer_token is passed explicitly, using the token defined above.
# - n is set far above the expected volume.
#   NOTE(review): n presumably acts as an upper limit on the number of
#   collected tweets; confirm against the academictwitteR documentation.
get_all_tweets(
  query = "#Karneval",
  start_tweets = "2022-11-11T00:00:00Z",
  end_tweets = "2022-11-13T12:00:00Z",
  file = "karneval",
  data_path = here("content/04-api_access-twitter/data.local/raw_karneval/"),
  n = 100000,
  bearer_token = personal_bearer_token
)

Read data from disk

# Step 1: bind the stored raw files of the #Karneval collection into one
# data set in "tidy" output format.
tweets_karneval <- bind_tweets(
  data_path = here("content/04-api_access-twitter/data.local/raw_karneval"),
  output_format = "tidy"
)

# Step 2: parse the creation timestamp and derive helper columns from it
# (calendar date, hour, minute, time of day, time of day at minute
# resolution).
tweets_karneval <- tweets_karneval %>%
  mutate(datetime = ymd_hms(created_at)) %>%
  mutate(
    date = date(datetime),
    hour = hour(datetime),
    min = minute(datetime),
    hms = hms::as_hms(datetime)
  ) %>%
  mutate(hm = hms::parse_hm(hms))

# Anonymization of potentially sensitive information: the columns from
# tweet_id through text, the author/conversation/reply identifiers and
# every user_* and sourcetweet_* column are replaced by the result of
# v_digest().
# NOTE(review): v_digest() is not defined in the visible part of this
# document; confirm it is created in a setup step that is not shown here.
tweets_karneval_hash <- mutate(
  tweets_karneval,
  across(
    .cols = c(
      tweet_id:text,
      author_id, conversation_id, in_reply_to_user_id,
      starts_with("user_"),
      starts_with("sourcetweet_")
    ),
    .fns = function(column) v_digest(column)
  )
)

Data analysis

Overview of dataset

# Compact overview (rows, columns, types, first values) of the hashed data
glimpse(tweets_karneval_hash)
Rows: 2,637
Columns: 37
$ tweet_id               <chr> "6748786565d7973caa87d2323915aa88", "54d163369a…
$ user_username          <chr> "378ed57a6433a1b3b40cb09889737b6c", "b3311d2c36…
$ text                   <chr> "082080d7671a3e96771128abe2d4d97a", "50a5d056f6…
$ created_at             <chr> "2022-11-11T08:15:08.000Z", "2022-11-11T08:14:5…
$ lang                   <chr> "de", "de", "de", "und", "und", "und", "und", "…
$ possibly_sensitive     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,…
$ author_id              <chr> "53c03c88c563bbe0c846ec72fbb009de", "455d0e5e38…
$ source                 <chr> "Twitter for Android", "Twitter for Android", "…
$ conversation_id        <chr> "6748786565d7973caa87d2323915aa88", "54d163369a…
$ in_reply_to_user_id    <chr> "55611e71b358a30158c61810ad802435", "55611e71b3…
$ user_protected         <chr> "72f531d1a36ddd6b5cbc433c29147817", "72f531d1a3…
$ user_description       <chr> "bafdc8d3fd80275db42115ae8e29b4b5", "9a7ef75623…
$ user_profile_image_url <chr> "f081768e89c22cd6541e544249f8bce0", "3abb25c65c…
$ user_pinned_tweet_id   <chr> "717229259056b4b0bc0ff08c60026525", "a70a399cbc…
$ user_verified          <chr> "72f531d1a36ddd6b5cbc433c29147817", "72f531d1a3…
$ user_name              <chr> "77e487e1cca06b726ed5d9dd402c7e3f", "4f1c874adf…
$ user_created_at        <chr> "963fa1ac02702aff4beae1184d6dc4e9", "4d26ba5d1b…
$ user_location          <chr> "55611e71b358a30158c61810ad802435", "27ce7d3e06…
$ user_url               <chr> "55611e71b358a30158c61810ad802435", "a71a2cbce2…
$ retweet_count          <int> 2, 0, 0, 178, 178, 178, 178, 178, 178, 178, 178…
$ like_count             <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ quote_count            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ user_tweet_count       <chr> "6491b568f945b160a838429d9e1ac780", "5741da7b11…
$ user_list_count        <chr> "1473d70e5646a26de3c52aa1abd85b1f", "1473d70e56…
$ user_followers_count   <chr> "635d85ebc32e43cadb00fe448947d969", "28a5de8169…
$ user_following_count   <chr> "77bd932347d1b80af13380a630d5fdad", "dc1a545561…
$ sourcetweet_type       <chr> "1598fc1c239906e1e10709bc12e5f0f3", "55611e71b3…
$ sourcetweet_id         <chr> "549605b6493750e7a3bd445969dd53e4", "55611e71b3…
$ sourcetweet_text       <chr> "261331dce22c0a8cfe03d52e17966a92", "55611e71b3…
$ sourcetweet_lang       <chr> "8310e591706d1e38cdbfd4e26f17a274", "55611e71b3…
$ sourcetweet_author_id  <chr> "b7b3a7c1d81f9f456dd6f2a707c44e65", "55611e71b3…
$ datetime               <dttm> 2022-11-11 08:15:08, 2022-11-11 08:14:57, 2022…
$ date                   <date> 2022-11-11, 2022-11-11, 2022-11-11, 2022-11-11…
$ hour                   <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,…
$ min                    <int> 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 12,…
$ hms                    <time> 08:15:08, 08:14:57, 08:14:29, 08:14:29, 08:13:…
$ hm                     <time> 08:15:00, 08:14:00, 08:14:00, 08:14:00, 08:13:…

Language of tweets

# Frequency table of the language codes, most frequent language first
frq(x = tweets_karneval[["lang"]], sort.frq = "desc")
x <character> 
# total N=2637 valid N=2637 mean=6.85 sd=5.06

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
de    | 2180 | 82.67 |   82.67 |  82.67
und   |  201 |  7.62 |    7.62 |  90.29
en    |  112 |  4.25 |    4.25 |  94.54
qme   |   43 |  1.63 |    1.63 |  96.17
es    |   18 |  0.68 |    0.68 |  96.85
tr    |   10 |  0.38 |    0.38 |  97.23
fr    |    9 |  0.34 |    0.34 |  97.57
nl    |    9 |  0.34 |    0.34 |  97.91
pl    |    8 |  0.30 |    0.30 |  98.22
in    |    7 |  0.27 |    0.27 |  98.48
no    |    7 |  0.27 |    0.27 |  98.75
qht   |    6 |  0.23 |    0.23 |  98.98
ca    |    5 |  0.19 |    0.19 |  99.17
da    |    4 |  0.15 |    0.15 |  99.32
cs    |    3 |  0.11 |    0.11 |  99.43
eu    |    3 |  0.11 |    0.11 |  99.54
fi    |    3 |  0.11 |    0.11 |  99.66
cy    |    2 |  0.08 |    0.08 |  99.73
et    |    2 |  0.08 |    0.08 |  99.81
ja    |    2 |  0.08 |    0.08 |  99.89
lt    |    2 |  0.08 |    0.08 |  99.96
is    |    1 |  0.04 |    0.04 | 100.00
<NA>  |    0 |  0.00 |    <NA> |   <NA>

Tweets over time

# Bar chart of the number of tweets per hour of day, one panel per date
ggplot(data = tweets_karneval, mapping = aes(x = hour)) +
  geom_bar() +
  facet_grid(cols = vars(date)) +
  theme_pubr()

Most frequent time (HH:MM) of sending tweets

# Frequency table of the posting time at minute resolution, sorted in
# descending order; values occurring fewer than 10 times are pooled.
frq(tweets_karneval, hm, sort.frq = "desc", min.frq = 10)
hm <numeric> 
# total N=2637 valid N=2637 mean=48463.12 sd=16629.59

Value    |    N | Raw % | Valid % | Cum. %
------------------------------------------
10:11:00 |   59 |  2.24 |    2.24 |   2.24
10:12:00 |   14 |  0.53 |    0.53 |   2.77
10:13:00 |   12 |  0.46 |    0.46 |   3.22
16:54:00 |   12 |  0.46 |    0.46 |   3.68
10:10:00 |   10 |  0.38 |    0.38 |   4.06
11:29:00 |   10 |  0.38 |    0.38 |   4.44
16:53:00 |   10 |  0.38 |    0.38 |   4.82
n < 10   | 2510 | 95.18 |   95.18 | 100.00
<NA>     |    0 |  0.00 |    <NA> |   <NA>

Users with the most tweets

# Frequency table of the (hashed) user names, sorted in descending order;
# users with fewer than 5 tweets are pooled.
frq(tweets_karneval_hash, user_username, sort.frq = "desc", min.frq = 5)
user_username <character> 
# total N=2637 valid N=2637 mean=996.15 sd=563.80

Value                            |    N | Raw % | Valid % | Cum. %
------------------------------------------------------------------
93b5f1c7a40ccc0c2acab3c32ba64e17 |   65 |  2.46 |    2.46 |   2.46
882fb98901749d154801b72be73612cd |   28 |  1.06 |    1.06 |   3.53
d2d15939aa0af671652046a62c7033dd |   19 |  0.72 |    0.72 |   4.25
43577471f043fc5a1deee9c789576df1 |   16 |  0.61 |    0.61 |   4.85
dbc086d9f5424d290ab7f028e7079434 |   15 |  0.57 |    0.57 |   5.42
460279057116f1c75027e3e0577b823a |   12 |  0.46 |    0.46 |   5.88
fb5b9a76332c82bae2edb40fa6ce45a2 |   12 |  0.46 |    0.46 |   6.33
798e8ead31e9ef06e637ba6753230b47 |   11 |  0.42 |    0.42 |   6.75
7cc9af9b177dc7dbb28261fe354f03dc |   10 |  0.38 |    0.38 |   7.13
ad63ff146a7453e17d3c0f641fb90668 |   10 |  0.38 |    0.38 |   7.51
dd4f7411a4e96c329645e17b8b86bde7 |   10 |  0.38 |    0.38 |   7.89
b98e9e94b335b813869e7e4f42b0e246 |    9 |  0.34 |    0.34 |   8.23
b906c56c87dd80721038f968fcae1f1c |    8 |  0.30 |    0.30 |   8.53
fc47c905f214127da9143747496b2791 |    8 |  0.30 |    0.30 |   8.84
19aeeeb4704c67bdfd12488918b187a8 |    7 |  0.27 |    0.27 |   9.10
5260a68437fcfa5933651c3bb9c476de |    7 |  0.27 |    0.27 |   9.37
d32b6bf0f04cf5761a33600491a03cb9 |    7 |  0.27 |    0.27 |   9.63
e88e062047d6361724e6eadea5c27196 |    7 |  0.27 |    0.27 |   9.90
1766755c40d730188cd4c2fd9a0c6f65 |    6 |  0.23 |    0.23 |  10.13
1db7e0f86b14b72290263bc41eb999c4 |    6 |  0.23 |    0.23 |  10.35
247af3b7f0f6e5e0b58f9538c3978fc5 |    6 |  0.23 |    0.23 |  10.58
55c6c438bf1805c480bf02e536b63e80 |    6 |  0.23 |    0.23 |  10.81
d2d1d7e6b418711e32ef22f254a6f6c1 |    6 |  0.23 |    0.23 |  11.04
fcd07c70c381d20d37c66581d641e021 |    6 |  0.23 |    0.23 |  11.26
002787d3db7ede7f2ccfc8abfbeba161 |    5 |  0.19 |    0.19 |  11.45
068f142dfe72218fcd7029cadd4cd63c |    5 |  0.19 |    0.19 |  11.64
2d03819b8f55f013cb3529cae30d2029 |    5 |  0.19 |    0.19 |  11.83
3dd3b0415552699e12542daa9aa6577b |    5 |  0.19 |    0.19 |  12.02
4547aade2457615177a54672cc0f252b |    5 |  0.19 |    0.19 |  12.21
5ba4609449eec6d1a421075fc5baf6bc |    5 |  0.19 |    0.19 |  12.40
6a06d4ca4502695e841f9b034a496ca2 |    5 |  0.19 |    0.19 |  12.59
82d63eca49c243b9756f84ee4bf979b4 |    5 |  0.19 |    0.19 |  12.78
a3a1feb9af68caa20b741e158f6c473a |    5 |  0.19 |    0.19 |  12.97
b0f0f6ba8eeb4d82e0626a0c77d8693c |    5 |  0.19 |    0.19 |  13.16
b15bb13a3e7aaf7e9dc435efe2e0e7cb |    5 |  0.19 |    0.19 |  13.35
b2d05eb93c4e8a0567155cd745c46e5a |    5 |  0.19 |    0.19 |  13.54
b558085daf9d04948ac6e367a3e4ccbf |    5 |  0.19 |    0.19 |  13.73
d7bab7e2527bdc614da0192db7b787c0 |    5 |  0.19 |    0.19 |  13.92
ec9644f848fe71ceb4a0b12fd5ed669f |    5 |  0.19 |    0.19 |  14.11
n < 5                            | 2265 | 85.89 |   85.89 | 100.00
<NA>                             |    0 |  0.00 |    <NA> |   <NA>

Mining tweets: profile(s)

Data collection

# Collect the tweets of the account "elonmusk" created between
# 2020-11-11 00:00 and 2022-11-13 12:00 (UTC, "Z" suffix) and store the raw
# files in the project-relative elonmusk folder.
get_all_tweets(
  bearer_token = personal_bearer_token,
  users = "elonmusk",
  start_tweets = "2020-11-11T00:00:00Z",
  end_tweets = "2022-11-13T12:00:00Z",
  n = 100000,
  file = "elonmusk",
  data_path = here("content/04-api_access-twitter/data.local/raw_elonmusk/")
)

Read data from disk

# Step 1: bind the stored raw files of the elonmusk collection into one
# data set in "tidy" output format.
tweets_musk <- bind_tweets(
  data_path = here("content/04-api_access-twitter/data.local/raw_elonmusk"),
  output_format = "tidy"
)

# Step 2: parse the creation timestamp and derive helper columns from it
# (calendar date, hour, minute, time of day, time of day at minute
# resolution).
tweets_musk <- tweets_musk %>%
  mutate(datetime = ymd_hms(created_at)) %>%
  mutate(
    date = date(datetime),
    hour = hour(datetime),
    min = minute(datetime),
    hms = hms::as_hms(datetime)
  ) %>%
  mutate(hm = hms::parse_hm(hms))

Data analysis

Overview of dataset

# Compact overview (rows, columns, types, first values) of the data set
glimpse(tweets_musk)
Rows: 7,255
Columns: 37
$ tweet_id               <chr> "1336809767574982658", "1336808486022258688", "…
$ user_username          <chr> "elonmusk", "elonmusk", "elonmusk", "elonmusk",…
$ text                   <chr> "Fuel header tank pressure was low during landi…
$ conversation_id        <chr> "1336808486022258688", "1336808486022258688", "…
$ author_id              <chr> "44196397", "44196397", "44196397", "44196397",…
$ in_reply_to_user_id    <chr> "44196397", NA, "4914384040", "3101588527", "34…
$ source                 <chr> "Twitter for iPhone", "Twitter for iPhone", "Tw…
$ possibly_sensitive     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ created_at             <chr> "2020-12-09T23:07:39.000Z", "2020-12-09T23:02:3…
$ lang                   <chr> "en", "en", "und", "en", "en", "en", "en", "en"…
$ user_created_at        <chr> "2009-06-02T20:12:29.000Z", "2009-06-02T20:12:2…
$ user_description       <chr> "", "", "", "", "", "", "", "", "", "", "", "",…
$ user_profile_image_url <chr> "https://pbs.twimg.com/profile_images/159096873…
$ user_name              <chr> "Elon Musk", "Elon Musk", "Elon Musk", "Elon Mu…
$ user_protected         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ user_verified          <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ user_location          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ user_url               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ user_pinned_tweet_id   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ retweet_count          <int> 6987, 9775, 349, 348, 1055, 7732, 10719, 324, 5…
$ like_count             <int> 96961, 106142, 15267, 13204, 22581, 0, 170964, …
$ quote_count            <int> 2027, 2953, 93, 111, 177, 0, 5106, 92, 143, 12,…
$ user_tweet_count       <int> 20290, 20290, 20290, 20290, 20290, 20290, 20290…
$ user_list_count        <int> 102027, 102027, 102027, 102027, 102027, 102027,…
$ user_followers_count   <int> 115405919, 115405919, 115405919, 115405919, 115…
$ user_following_count   <int> 130, 130, 130, 130, 130, 130, 130, 130, 130, 13…
$ sourcetweet_type       <chr> NA, "quoted", NA, NA, NA, "retweeted", NA, NA, …
$ sourcetweet_id         <chr> NA, "1336777137391456256", NA, NA, NA, "1336349…
$ sourcetweet_text       <chr> NA, "Watch Starship high-altitude test live → h…
$ sourcetweet_lang       <chr> NA, "en", NA, NA, NA, "en", NA, NA, NA, NA, "en…
$ sourcetweet_author_id  <chr> NA, "34743251", NA, NA, NA, "34743251", NA, NA,…
$ datetime               <dttm> 2020-12-09 23:07:39, 2020-12-09 23:02:34, 2020…
$ date                   <date> 2020-12-09, 2020-12-09, 2020-12-09, 2020-12-08…
$ hour                   <int> 23, 23, 18, 18, 16, 16, 16, 2, 2, 0, 22, 20, 18…
$ min                    <int> 7, 2, 13, 5, 57, 55, 44, 51, 50, 16, 16, 13, 14…
$ hms                    <time> 23:07:39, 23:02:34, 18:13:21, 18:05:28, 16:57:…
$ hm                     <time> 23:07:00, 23:02:00, 18:13:00, 18:05:00, 16:57:…

Tweets over time

# Bar chart of the number of tweets per calendar date
ggplot(data = tweets_musk, mapping = aes(x = date)) +
  geom_bar() +
  theme_pubr()

Tweets with the most likes

# Ten most-liked tweets among those without a source tweet type
# (i.e. rows that are neither marked "quoted" nor "retweeted").
tweets_musk %>%
  filter(is.na(sourcetweet_type)) %>%
  select(text, created_at, like_count) %>%
  arrange(desc(like_count)) %>%
  head(n = 10)
# A tibble: 10 × 3
   text                                                          creat…¹ like_…²
   <chr>                                                         <chr>     <int>
 1 "Next I’m buying Coca-Cola to put the cocaine back in"        2022-0… 4767770
 2 "I hope that even my worst critics remain on Twitter, becaus… 2022-0… 3221746
 3 "Let’s make Twitter maximum fun!"                             2022-0… 2641791
 4 "\U0001f680\U0001f4ab♥️ Yesss!!! ♥️\U0001f4ab\U0001f680 https:… 2022-0… 2599811
 5 "Listen, I can’t do miracles ok https://t.co/z7dvLMUXy8"      2022-0… 2572036
 6 "the bird is freed"                                           2022-1… 2497902
 7 "Comedy is now legal on Twitter"                              2022-1… 2400045
 8 "https://t.co/kGncG7Hs3M"                                     2022-1… 1893281
 9 "If I die under mysterious circumstances, it’s been nice kno… 2022-0… 1891028
10 "The extreme antibody reaction from those who fear free spee… 2022-0… 1647281
# … with abbreviated variable names ¹​created_at, ²​like_count

Tweets with the most retweets

# Ten most-retweeted tweets among those without a source tweet type
# (i.e. rows that are neither marked "quoted" nor "retweeted").
tweets_musk %>%
  filter(is.na(sourcetweet_type)) %>%
  select(text, created_at, retweet_count) %>%
  arrange(desc(retweet_count)) %>%
  head(n = 10)
# A tibble: 10 × 3
   text                                                          creat…¹ retwe…²
   <chr>                                                         <chr>     <int>
 1 "Next I’m buying Coca-Cola to put the cocaine back in"        2022-0…  679688
 2 "I hope that even my worst critics remain on Twitter, becaus… 2022-0…  366807
 3 "the bird is freed"                                           2022-1…  357937
 4 "\U0001f680\U0001f4ab♥️ Yesss!!! ♥️\U0001f4ab\U0001f680 https:… 2022-0…  346717
 5 "Comedy is now legal on Twitter"                              2022-1…  261224
 6 "Listen, I can’t do miracles ok https://t.co/z7dvLMUXy8"      2022-0…  212059
 7 "https://t.co/Q9OjlJhi7f"                                     2022-0…  207978
 8 "Let’s make Twitter maximum fun!"                             2022-0…  193913
 9 "The extreme antibody reaction from those who fear free spee… 2022-0…  191983
10 "Entering Twitter HQ – let that sink in! https://t.co/D68z4K… 2022-1…  190507
# … with abbreviated variable names ¹​created_at, ²​retweet_count

Proportion of tweets

# Frequency table of the source tweet type (quoted / retweeted / missing)
frq(tweets_musk, sourcetweet_type)
sourcetweet_type <character> 
# total N=7255 valid N=489 mean=1.74 sd=0.44

Value     |    N | Raw % | Valid % | Cum. %
-------------------------------------------
quoted    |  125 |  1.72 |   25.56 |  25.56
retweeted |  364 |  5.02 |   74.44 | 100.00
<NA>      | 6766 | 93.26 |    <NA> |   <NA>

Language of tweets

# Frequency table of the language codes
frq(tweets_musk, lang)
lang <character> 
# total N=7255 valid N=7255 mean=14.37 sd=9.86

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
ar    |    2 |  0.03 |    0.03 |   0.03
art   |   12 |  0.17 |    0.17 |   0.19
bg    |    1 |  0.01 |    0.01 |   0.21
ca    |   10 |  0.14 |    0.14 |   0.34
cs    |    1 |  0.01 |    0.01 |   0.36
cy    |    1 |  0.01 |    0.01 |   0.37
da    |    6 |  0.08 |    0.08 |   0.45
de    |   24 |  0.33 |    0.33 |   0.79
el    |    2 |  0.03 |    0.03 |   0.81
en    | 5915 | 81.53 |   81.53 |  82.34
es    |   15 |  0.21 |    0.21 |  82.55
et    |    6 |  0.08 |    0.08 |  82.63
eu    |    3 |  0.04 |    0.04 |  82.67
fr    |   20 |  0.28 |    0.28 |  82.95
hi    |    1 |  0.01 |    0.01 |  82.96
ht    |    2 |  0.03 |    0.03 |  82.99
hu    |    2 |  0.03 |    0.03 |  83.02
in    |    8 |  0.11 |    0.11 |  83.13
is    |    1 |  0.01 |    0.01 |  83.14
it    |    6 |  0.08 |    0.08 |  83.23
ja    |    5 |  0.07 |    0.07 |  83.29
lt    |    3 |  0.04 |    0.04 |  83.34
nl    |    4 |  0.06 |    0.06 |  83.39
no    |    1 |  0.01 |    0.01 |  83.40
pl    |    5 |  0.07 |    0.07 |  83.47
pt    |    7 |  0.10 |    0.10 |  83.57
qam   |   40 |  0.55 |    0.55 |  84.12
qht   |    1 |  0.01 |    0.01 |  84.14
qme   |   81 |  1.12 |    1.12 |  85.25
qst   |    5 |  0.07 |    0.07 |  85.32
ro    |    3 |  0.04 |    0.04 |  85.36
ru    |    7 |  0.10 |    0.10 |  85.46
sl    |    2 |  0.03 |    0.03 |  85.49
tl    |   59 |  0.81 |    0.81 |  86.30
tr    |    5 |  0.07 |    0.07 |  86.37
uk    |    1 |  0.01 |    0.01 |  86.38
und   |  817 | 11.26 |   11.26 |  97.64
vi    |    1 |  0.01 |    0.01 |  97.66
zh    |    1 |  0.01 |    0.01 |  97.67
zxx   |  169 |  2.33 |    2.33 | 100.00
<NA>  |    0 |  0.00 |    <NA> |   <NA>

Text mining

Preprocessing

# Regular expression matching the HTML entities that are stripped from
# the tweet text
remove_html <- "&amp;|&lt;|&gt;"

# English tweets without a source tweet type, reduced to id, text and
# user name, with the HTML entities removed from the text
tweets_en <- tweets_musk %>%
  filter(lang == "en", is.na(sourcetweet_type)) %>%
  mutate(text = str_remove_all(text, remove_html)) %>%
  select(tweet_id, text, user_username)

# Corpus with one document per tweet, identified by the tweet id
tweets_en_corpus <- tweets_en %>%
  corpus(docid_field = "tweet_id", text_field = "text")

# Tokenise without punctuation, numbers, symbols and URLs, convert the
# tokens to lower case and drop English stopwords
tweets_en_tokens <- tweets_en_corpus %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("english"))

# Document-feature matrix built from the cleaned tokens
tweets_en_dfm <- tweets_en_tokens %>%
  dfm()

Analysis

Top Hashtags

# Keep only features starting with "#", take the names of the (up to) 50
# most frequent ones and show the first ten
tag_dfm <- tweets_en_dfm %>%
  dfm_select(pattern = "#*")
toptag <- tag_dfm %>%
  topfeatures(n = 50) %>%
  names()
head(toptag, n = 10)
[1] "#2"                  "#mars"               "#resistanceisfutile"
[4] "#1"                  "#freespeech"        

Top Mentions

# Keep only features starting with "@", take the names of the (up to) 50
# most frequent ones and show the first ten
user_dfm <- tweets_en_dfm %>%
  dfm_select(pattern = "@*")
topuser <- user_dfm %>%
  topfeatures(n = 50) %>%
  names()
head(topuser, n = 10)
 [1] "@wholemarsblog"  "@spacex"         "@teslaownerssv"  "@ppathole"      
 [5] "@tesla"          "@erdayastronaut" "@billym2k"       "@teslarati"     
 [9] "@sawyermerritt"  "@evafoxu"       

Exclude Hashtags & Mentions

# Drop all mention ("@...") and hashtag ("#...") features from the
# document-feature matrix
tweets_en_clean <- dfm_remove(tweets_en_dfm, pattern = c("@*", "#*"))

Top 10 features

# Frequency statistics for every remaining feature; show the first ten rows
term_freq_en <- tweets_en_clean %>%
  textstat_frequency()
term_freq_en %>%
  head(n = 10)
   feature frequency rank docfreq group
1    tesla       354    1     328   all
2     just       227    2     225   all
3     good       215    3     209   all
4    great       187    4     183   all
5     much       184    5     180   all
6     like       172    6     168   all
7      can       169    7     165   all
8   people       167    8     156   all
9  twitter       156    9     147   all
10     one       146   10     144   all

Wordcloud with Top 50 features

# Word cloud limited to at most 50 words of the cleaned matrix
tweets_en_clean %>%
  textplot_wordcloud(max_words = 50)

References

Barrie, C., & Ho, J. (2021). academictwitteR: An r package to access the twitter academic research product track v2 API endpoint. Journal of Open Source Software, 6(62), 3272. https://doi.org/10.21105/joss.03272