Showcase
Focus on Zeeschuimer
Background
This showcase illustrates different ways of analyzing TikTok data downloaded with the Zeeschuimer
browser extension.
Data analysis
TikToks that are tagged with the hashtag
statistics
collected via
Zeeschuimer
with .csv export via 🐈🐈 4CAT 🐈🐈
Data import from
# Load packages
library(here)
library(tidyverse)
library(readr)

# Import the .csv export; the path is resolved with here().
# `author_followers` is read with col_number(); all other column types are
# left to readr's type guessing.
statistics <- read_csv(
  here("content/07-webscraping-tiktok/data.local/tiktok-search-statistics.csv"),
  col_types = cols(author_followers = col_number()))
# Anonymization of potentially sensitive information:
# apply v_digest() to every column in the ranges id:body and
# video_url:thumbnail_url; all other columns are kept unchanged.
# NOTE(review): v_digest() is not defined in this section -- presumably a
# vectorized hashing helper; confirm it is available before this chunk runs.
statistics_hash <- statistics %>%
  mutate(
    across(c(
      id:body,
      video_url:thumbnail_url),
      ~ v_digest(.)))

# Quick preview
statistics_hash %>% glimpse()
Rows: 941
Columns: 24
$ id <chr> "c158c50de9203a700525c2273c722f55", "5794a6967c21834c…
$ thread_id <chr> "c158c50de9203a700525c2273c722f55", "5794a6967c21834c…
$ author <chr> "1e25edc01a1eff924105786baa35cb88", "1e25edc01a1eff92…
$ author_full <chr> "867f5f0cf9b63e68eebb58854d8f779d", "867f5f0cf9b63e68…
$ author_id <chr> "1b94ef47779439aedbf89a519ccb0ac3", "1b94ef47779439ae…
$ author_followers <chr> "34f5624a9f54d961f205fa9ccf2c2816", "34f5624a9f54d961…
$ body <chr> "565707abc618ca27d9f5c9bd718488f0", "63cdbef44fbf560a…
$ timestamp <dttm> 2020-04-09 19:44:39, 2020-05-30 20:28:05, 2020-07-03…
$ unix_timestamp <dbl> 1586461479, 1590870485, 1593811442, 1612846242, 16420…
$ is_duet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ music_name <chr> "SexyBack", "original sound", "original sound", "orig…
$ music_id <dbl> 6.696418e+18, 6.832737e+18, 6.845368e+18, 6.927122e+1…
$ music_url <chr> "https://sf16-ies-music-va.tiktokcdn.com/obj/tos-usea…
$ video_url <chr> "1504eb84f490bc0b7a7245feb5b2e58f", "79c191b9dcea9193…
$ tiktok_url <chr> "6aaa20d1f42837af425dd2656b2d87a7", "d53c488f26b5db40…
$ thumbnail_url <chr> "dd1e26c4dd3b8a8f2d8fa595e7f7af7e", "b99c1df931f7ff61…
$ likes <dbl> 1200000, 910000, 901000, 794300, 740300, 701400, 6490…
$ comments <dbl> 7746, 11900, 3020, 36900, 8179, 8150, 34800, 7592, 28…
$ shares <dbl> 23000, 16600, 1755, 64000, 6397, 1685, 93800, 51300, …
$ plays <dbl> 6700000, 3300000, 5100000, 3800000, 2900000, 2500000,…
$ hashtags <chr> "fyp,love,dating,romance,relationship,crush,people,po…
$ stickers <chr> NA, NA, NA, "that one guy", "Ok…but I guess Timmy is …
$ effects <chr> NA, NA, NA, NA, "Greenscreen", NA, "Disco", NA, "TapT…
$ warning <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
Exploration
Period in which the TikToks were posted
# Load packages
library(lubridate)
library(sjPlot)
library(ggpubr)

# Display: frequency plot of the posting year.
# year() extracts the year from `timestamp`; it is converted to a factor
# before being passed to plot_frq().
statistics %>%
  mutate(date = as.factor(year(timestamp))) %>%
  plot_frq(date) +
  theme_pubr()
Location parameters of different statistics
library(sjmisc)

# Descriptive statistics for all columns in the range likes:plays
statistics %>%
  select(likes:plays) %>%
  descr()
## Basic descriptive statistics
var type label n NA.prc mean sd se md
likes numeric likes 941 0 50412.33 110696.23 3608.59 16600
comments numeric comments 941 0 980.51 2380.29 77.60 351
shares numeric shares 941 0 1349.89 4755.66 155.03 262
plays numeric plays 941 0 384388.52 750096.58 24452.45 153700
trimmed range iqr skew
26511.33 1395280 (4720-1400000) 37830 6.14
537.56 36900 (0-36900) 791 8.94
527.19 93796 (4-93800) 820 12.24
220367.46 7985100 (14900-8e+06) 309300 5.44
Distribution of likes
# Density plot of the `likes` column
statistics %>%
  plot_frq(likes, type = "density")
Warning messages displayed
# Frequency table of the `warning` column
statistics %>%
  frq(warning)
warning <character>
# total N=941 valid N=24 mean=1.88 sd=0.80
Value | N | Raw % | Valid % | Cum. %
-----------------------------------------------------------------------------------------------------------------------------------------
Learn more about COVID-19 vaccines | 9 | 0.96 | 37.50 | 37.50
Learn the facts about COVID-19 | 9 | 0.96 | 37.50 | 75.00
The actions in this video are performed by professionals or supervised by professionals. Do not attempt. | 6 | 0.64 | 25.00 | 100.00
<NA> | 917 | 97.45 | <NA> | <NA>
Text analysis
Corpus creation
# Load packages
library(quanteda)

# Create corpus based on variable hashtags:
# `id` supplies the document names, `hashtags` supplies the document text.
crp <- corpus(
  statistics_hash,
  docid_field = "id",
  text_field = "hashtags")

# Display
crp
Corpus consisting of 941 documents and 22 docvars.
c158c50de9203a700525c2273c722f55 :
"fyp,love,dating,romance,relationship,crush,people,population..."
5794a6967c21834c4808d8ad0bc13e05 :
"fyp,blacklivesmatter,tiktokpartner,learnontiktok,police,fact..."
abb283c2d6d427232ad29eb35ebf944e :
"skittles,statistics,education,fyp,foryou"
97b1104022105b411a2f9c54ce5740f7 :
"hotguy,itwasntme,turbotaxlivepick6,doritosflatlife,foryou,wa..."
01f2b8936aea8ccc94ac1c8c3a1e7970 :
"timotheechalamet,fyp,foryou,timothee,peach,callmebyyourname,..."
c25da2e0796117dcedb418291a8bdd10 :
"stitch,statistics,staticstics,fyp,foryoupage,trending"
[ reached max_ndoc ... 935 more documents ]
Tokenization
# Create tokens based on corpus, with punctuation, symbols, URLs and
# separators removed
tkn <- crp %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE)

# Display
tkn
Tokens consisting of 941 documents and 22 docvars.
c158c50de9203a700525c2273c722f55 :
[1] "fyp" "love" "dating" "romance" "relationship"
[6] "crush" "people" "population" "world" "math"
[11] "stats" "statistics"
5794a6967c21834c4808d8ad0bc13e05 :
[1] "fyp" "blacklivesmatter" "tiktokpartner" "learnontiktok"
[5] "police" "facts" "fact" "statistics"
[9] "usa"
abb283c2d6d427232ad29eb35ebf944e :
[1] "skittles" "statistics" "education" "fyp" "foryou"
97b1104022105b411a2f9c54ce5740f7 :
[1] "hotguy" "itwasntme" "turbotaxlivepick6"
[4] "doritosflatlife" "foryou" "wap"
[7] "statistics" "fyp" "foryoupage"
[10] "wap"
01f2b8936aea8ccc94ac1c8c3a1e7970 :
[1] "timotheechalamet" "fyp" "foryou" "timothee"
[5] "peach" "callmebyyourname" "statistics"
c25da2e0796117dcedb418291a8bdd10 :
[1] "stitch" "statistics" "staticstics" "fyp" "foryoupage"
[6] "trending"
[ reached max_ndoc ... 935 more documents ]
Create Document-Feature-Matrix (DFM)
# Create dfm based on tokens.
# The result is stored in an object that is also called `dfm`; later chunks
# refer to it by that name.
dfm <- tkn %>%
  dfm()

# Display
dfm
Document-feature matrix of: 941 documents, 2,940 features (99.71% sparse) and 22 docvars.
features
docs fyp love dating romance relationship crush
c158c50de9203a700525c2273c722f55 1 1 1 1 1 1
5794a6967c21834c4808d8ad0bc13e05 1 0 0 0 0 0
abb283c2d6d427232ad29eb35ebf944e 1 0 0 0 0 0
97b1104022105b411a2f9c54ce5740f7 1 0 0 0 0 0
01f2b8936aea8ccc94ac1c8c3a1e7970 1 0 0 0 0 0
c25da2e0796117dcedb418291a8bdd10 1 0 0 0 0 0
features
docs people population world math
c158c50de9203a700525c2273c722f55 1 1 1 1
5794a6967c21834c4808d8ad0bc13e05 0 0 0 0
abb283c2d6d427232ad29eb35ebf944e 0 0 0 0
97b1104022105b411a2f9c54ce5740f7 0 0 0 0
01f2b8936aea8ccc94ac1c8c3a1e7970 0 0 0 0
c25da2e0796117dcedb418291a8bdd10 0 0 0 0
[ reached max_ndoc ... 935 more documents, reached max_nfeat ... 2,930 more features ]
Wordcloud
library(quanteda.textplots)

# Word cloud of the features in the `dfm` object (at most 50 words,
# no rotation)
dfm %>%
  textplot_wordcloud(
    min_size = 1,
    max_size = 8,
    max_words = 50,
    rotation = 0
  )
Without the search term "statistics"
# Word cloud after removing features matching the pattern "statistics"
# from the `dfm` object
dfm %>%
  dfm_remove(pattern = "statistics") %>%
  textplot_wordcloud(
    min_size = 1,
    max_size = 8,
    max_words = 50,
    rotation = 0,
    color = "dodgerblue3"
  )