Showcase

Focus on Zeeschuimer

Tip

Open this showcase in other interactive and executable environments:

Background

This showcase illustrates several ways of analysing TikTok data that was collected with the Zeeschuimer browser extension.

Data analysis

TikToks tagged with the hashtag #statistics,
collected via Zeeschuimer and exported as .csv via 🐈🐈 4CAT 🐈🐈

Data import from

# load packages
library(here)
library(tidyverse)
library(readr)

# Import the .csv export of the TikTok search results.
# Only `author_followers` gets an explicit parser (col_number()); every other
# column is left to readr's type guessing.
# NOTE(review): the preview further down shows `music_id` guessed as <dbl>
# (6.696418e+18). Values of that magnitude cannot be stored exactly in a
# double -- consider col_character() if the ID is ever used as a key.
statistics <- readr::read_csv(
  file = here::here("content/07-webscraping-tiktok/data.local/tiktok-search-statistics.csv"),
  col_types = readr::cols(author_followers = readr::col_number())
)

# Anonymise potentially sensitive columns by replacing their values with the
# result of `v_digest()`.
# The two column ranges cover `id` through `body` (which also includes
# `author_followers`) and `video_url` through `thumbnail_url`.
# NOTE(review): `v_digest()` is not defined in the visible part of this file;
# presumably a vectorised hashing helper set up in a hidden chunk -- confirm.
statistics_hash <- mutate(
  statistics,
  across(
    c(id:body, video_url:thumbnail_url),
    function(column) v_digest(column)
  )
)

# Compact overview of the anonymised data (one line per column)
glimpse(statistics_hash)

Rows: 941
Columns: 24
$ id               <chr> "c158c50de9203a700525c2273c722f55", "5794a6967c21834c…
$ thread_id        <chr> "c158c50de9203a700525c2273c722f55", "5794a6967c21834c…
$ author           <chr> "1e25edc01a1eff924105786baa35cb88", "1e25edc01a1eff92…
$ author_full      <chr> "867f5f0cf9b63e68eebb58854d8f779d", "867f5f0cf9b63e68…
$ author_id        <chr> "1b94ef47779439aedbf89a519ccb0ac3", "1b94ef47779439ae…
$ author_followers <chr> "34f5624a9f54d961f205fa9ccf2c2816", "34f5624a9f54d961…
$ body             <chr> "565707abc618ca27d9f5c9bd718488f0", "63cdbef44fbf560a…
$ timestamp        <dttm> 2020-04-09 19:44:39, 2020-05-30 20:28:05, 2020-07-03…
$ unix_timestamp   <dbl> 1586461479, 1590870485, 1593811442, 1612846242, 16420…
$ is_duet          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ music_name       <chr> "SexyBack", "original sound", "original sound", "orig…
$ music_id         <dbl> 6.696418e+18, 6.832737e+18, 6.845368e+18, 6.927122e+1…
$ music_url        <chr> "https://sf16-ies-music-va.tiktokcdn.com/obj/tos-usea…
$ video_url        <chr> "1504eb84f490bc0b7a7245feb5b2e58f", "79c191b9dcea9193…
$ tiktok_url       <chr> "6aaa20d1f42837af425dd2656b2d87a7", "d53c488f26b5db40…
$ thumbnail_url    <chr> "dd1e26c4dd3b8a8f2d8fa595e7f7af7e", "b99c1df931f7ff61…
$ likes            <dbl> 1200000, 910000, 901000, 794300, 740300, 701400, 6490…
$ comments         <dbl> 7746, 11900, 3020, 36900, 8179, 8150, 34800, 7592, 28…
$ shares           <dbl> 23000, 16600, 1755, 64000, 6397, 1685, 93800, 51300, …
$ plays            <dbl> 6700000, 3300000, 5100000, 3800000, 2900000, 2500000,…
$ hashtags         <chr> "fyp,love,dating,romance,relationship,crush,people,po…
$ stickers         <chr> NA, NA, NA, "that one guy", "Ok…but I guess Timmy is …
$ effects          <chr> NA, NA, NA, NA, "Greenscreen", NA, "Disco", NA, "TapT…
$ warning          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

Exploration

Tip

The following graphics (and especially their labels) may appear very small. To view the graphics in their original size, right-click on the images and select “Open image/graphic in new tab”.

Period in which the TikToks were posted

# Load packages
library(lubridate)
library(sjPlot)
library(ggpubr)

# Frequency plot of the posting year: derive the year from `timestamp`,
# turn it into a factor so each year gets its own bar, then apply the
# ggpubr publication theme.
plot_frq(
  mutate(statistics, date = as.factor(year(timestamp))),
  date
) +
  theme_pubr()

Location parameters of different statistics

library(sjmisc)

# Descriptive statistics for the engagement columns `likes` through `plays`
descr(select(statistics, likes:plays))


## Basic descriptive statistics

      var    type    label   n NA.prc      mean        sd       se     md
    likes numeric    likes 941      0  50412.33 110696.23  3608.59  16600
 comments numeric comments 941      0    980.51   2380.29    77.60    351
   shares numeric   shares 941      0   1349.89   4755.66   155.03    262
    plays numeric    plays 941      0 384388.52 750096.58 24452.45 153700
   trimmed                  range    iqr  skew
  26511.33 1395280 (4720-1400000)  37830  6.14
    537.56        36900 (0-36900)    791  8.94
    527.19        93796 (4-93800)    820 12.24
 220367.46  7985100 (14900-8e+06) 309300  5.44

Distribution of likes

# Density plot of the `likes` column
plot_frq(statistics, likes, type = "density")

Warning messages displayed

# Frequency table of the `warning` column
frq(statistics, warning)

warning <character> 
# total N=941 valid N=24 mean=1.88 sd=0.80

Value                                                                                                    |   N | Raw % | Valid % | Cum. %
-----------------------------------------------------------------------------------------------------------------------------------------
Learn more about COVID-19 vaccines                                                                       |   9 |  0.96 |   37.50 |  37.50
Learn the facts about COVID-19                                                                           |   9 |  0.96 |   37.50 |  75.00
The actions in this video are performed by professionals or supervised by professionals. Do not attempt. |   6 |  0.64 |   25.00 | 100.00
<NA>                                                                                                     | 917 | 97.45 |    <NA> |   <NA>

Text analysis

Corpus creation

# Load packages
library(quanteda)

# Build a quanteda corpus from the anonymised data:
# the (hashed) `id` column names the documents, the `hashtags` column
# supplies the text; the remaining columns are kept as docvars.
crp <- statistics_hash %>%
  corpus(docid_field = "id", text_field = "hashtags")

# Print a preview of the corpus
crp

Corpus consisting of 941 documents and 22 docvars.
c158c50de9203a700525c2273c722f55 :
"fyp,love,dating,romance,relationship,crush,people,population..."

5794a6967c21834c4808d8ad0bc13e05 :
"fyp,blacklivesmatter,tiktokpartner,learnontiktok,police,fact..."

abb283c2d6d427232ad29eb35ebf944e :
"skittles,statistics,education,fyp,foryou"

97b1104022105b411a2f9c54ce5740f7 :
"hotguy,itwasntme,turbotaxlivepick6,doritosflatlife,foryou,wa..."

01f2b8936aea8ccc94ac1c8c3a1e7970 :
"timotheechalamet,fyp,foryou,timothee,peach,callmebyyourname,..."

c25da2e0796117dcedb418291a8bdd10 :
"stitch,statistics,staticstics,fyp,foryoupage,trending"

[ reached max_ndoc ... 935 more documents ]

Tokenization

# Tokenise the hashtag corpus. Punctuation, symbols, URLs and separators are
# removed, so each comma-separated hashtag ends up as its own token.
tkn <- tokens(
  crp,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Print a preview of the tokens
tkn

Tokens consisting of 941 documents and 22 docvars.
c158c50de9203a700525c2273c722f55 :
 [1] "fyp"          "love"         "dating"       "romance"      "relationship"
 [6] "crush"        "people"       "population"   "world"        "math"        
[11] "stats"        "statistics"  

5794a6967c21834c4808d8ad0bc13e05 :
[1] "fyp"              "blacklivesmatter" "tiktokpartner"    "learnontiktok"   
[5] "police"           "facts"            "fact"             "statistics"      
[9] "usa"             

abb283c2d6d427232ad29eb35ebf944e :
[1] "skittles"   "statistics" "education"  "fyp"        "foryou"    

97b1104022105b411a2f9c54ce5740f7 :
 [1] "hotguy"            "itwasntme"         "turbotaxlivepick6"
 [4] "doritosflatlife"   "foryou"            "wap"              
 [7] "statistics"        "fyp"               "foryoupage"       
[10] "wap"              

01f2b8936aea8ccc94ac1c8c3a1e7970 :
[1] "timotheechalamet" "fyp"              "foryou"           "timothee"        
[5] "peach"            "callmebyyourname" "statistics"      

c25da2e0796117dcedb418291a8bdd10 :
[1] "stitch"      "statistics"  "staticstics" "fyp"         "foryoupage" 
[6] "trending"   

[ reached max_ndoc ... 935 more documents ]

Create Document-Feature-Matrix (DFM)

# Build the document-feature matrix from the tokens.
# NOTE(review): the result is stored under the name `dfm`, the same name as
# the quanteda function. Calls to `dfm()` keep working because R looks up a
# function in call position, but a distinct object name would be clearer.
dfm <- dfm(tkn)

# Print a preview of the matrix
dfm

Document-feature matrix of: 941 documents, 2,940 features (99.71% sparse) and 22 docvars.
                                  features
docs                               fyp love dating romance relationship crush
  c158c50de9203a700525c2273c722f55   1    1      1       1            1     1
  5794a6967c21834c4808d8ad0bc13e05   1    0      0       0            0     0
  abb283c2d6d427232ad29eb35ebf944e   1    0      0       0            0     0
  97b1104022105b411a2f9c54ce5740f7   1    0      0       0            0     0
  01f2b8936aea8ccc94ac1c8c3a1e7970   1    0      0       0            0     0
  c25da2e0796117dcedb418291a8bdd10   1    0      0       0            0     0
                                  features
docs                               people population world math
  c158c50de9203a700525c2273c722f55      1          1     1    1
  5794a6967c21834c4808d8ad0bc13e05      0          0     0    0
  abb283c2d6d427232ad29eb35ebf944e      0          0     0    0
  97b1104022105b411a2f9c54ce5740f7      0          0     0    0
  01f2b8936aea8ccc94ac1c8c3a1e7970      0          0     0    0
  c25da2e0796117dcedb418291a8bdd10      0          0     0    0
[ reached max_ndoc ... 935 more documents, reached max_nfeat ... 2,930 more features ]

Wordcloud

library(quanteda.textplots)

# Word cloud of the 50 most frequent hashtags, drawn without rotation
textplot_wordcloud(
  dfm,
  min_size = 1,
  max_size = 8,
  max_words = 50,
  rotation = 0
)

without the search term `statistics`

# Same word cloud, but with the feature "statistics" dropped from the matrix
# first so that it does not dominate the plot.
textplot_wordcloud(
  dfm_remove(dfm, pattern = "statistics"),
  min_size = 1,
  max_size = 8,
  max_words = 50,
  rotation = 0,
  color = "dodgerblue3"
)

Background

Data analysis

Data import from

Exploration

Period in which the TikToks were posted

Location parameters of different statistics

Distribution of likes

Warning messages displayed

Text analysis

Corpus creation

Tokenization

Create Document-Feature-Matrix (DFM)

Wordcloud

without the search term statistics

without the search term `statistics`