Title: A 'Shiny' Application for Inspecting Structural Topic Models
Description: This app enables interactive validation, interpretation and visualization of structural topic models from the 'stm' package by Roberts and others (2014) <doi:10.1111/ajps.12103>. It also includes helper functions for model diagnostics and extracting data from effect estimates.
Authors: Carsten Schwemmer [aut, cre], Jonne Guyt [ctb]
Maintainer: Carsten Schwemmer <[email protected]>
License: MIT + file LICENSE
Version: 0.4.3
Built: 2024-10-25 05:49:06 UTC
Source: https://github.com/cschwem2er/stminsights
get_diag() is a helper function to compute average and median semanticCoherence and exclusivity for a number of stm models. The function does not work for models with content covariates.
get_diag(models, outobj)
models: A list of stm models.
outobj: The out object used to fit the models (e.g. as returned by quanteda's convert(x, to = 'stm'), as in the example below).
Returns model diagnostics in a data frame.
library(stm)
library(dplyr)
library(ggplot2)
library(quanteda)

# prepare data
data <- corpus(gadarian, text_field = 'open.ended.response')
docvars(data)$text <- as.character(data)
data <- tokens(data, remove_punct = TRUE) |>
  tokens_wordstem() |>
  tokens_remove(stopwords('english')) |>
  dfm() |>
  dfm_trim(min_termfreq = 2)
out <- convert(data, to = 'stm')

# fit models
gadarian_3 <- stm(documents = out$documents,
                  vocab = out$vocab,
                  data = out$meta,
                  prevalence = ~ treatment + s(pid_rep),
                  K = 3,
                  max.em.its = 1, # reduce computation time for example
                  verbose = FALSE)

gadarian_5 <- stm(documents = out$documents,
                  vocab = out$vocab,
                  data = out$meta,
                  prevalence = ~ treatment + s(pid_rep),
                  K = 5,
                  max.em.its = 1, # reduce computation time for example
                  verbose = FALSE)

# get diagnostics
diag <- get_diag(models = list(
  model_3 = gadarian_3,
  model_5 = gadarian_5),
  outobj = out)

## Not run:
# plot diagnostics
diag |>
  ggplot(aes(x = coherence, y = exclusivity, color = statistic)) +
  geom_text(aes(label = name), nudge_x = 5) +
  geom_point() +
  labs(x = 'Semantic Coherence', y = 'Exclusivity') +
  theme_light()
## End(Not run)
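Because get_diag() returns a plain data frame, the diagnostics can also be compared without plotting. A minimal sketch, reusing diag from above; the columns name, statistic, coherence and exclusivity are inferred from the plotting example, and the statistic values 'mean' and 'median' are assumed from the description:

library(dplyr)

# rank the fitted models by their mean coherence/exclusivity trade-off
# (column names inferred from the plotting example; this is a sketch,
# not part of the package documentation)
diag |>
  filter(statistic == 'mean') |>
  arrange(desc(coherence + exclusivity)) |>
  select(name, coherence, exclusivity)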
get_effects() is a helper function to store effect estimates from stm in a data frame.
get_effects(
  estimates,
  variable,
  type,
  ci = 0.95,
  moderator = NULL,
  modval = NULL,
  cov_val1 = NULL,
  cov_val2 = NULL
)
estimates: The object containing estimates calculated with stm's estimateEffect().
variable: The variable for which estimates should be extracted.
type: The estimate type. Must be either 'pointestimate', 'continuous' or 'difference'.
ci: The confidence interval for uncertainty estimates. Defaults to 0.95.
moderator: The moderator variable in case you want to include an interaction effect.
modval: The value of the moderator variable for an interaction effect. See the examples for combining data for multiple values.
cov_val1: The first value of a covariate for type 'difference'.
cov_val2: The second value of a covariate for type 'difference'.
Returns effect estimates in a tidy data frame.
library(stm)
library(dplyr)
library(ggplot2)

# store effects
prep <- estimateEffect(1:3 ~ treatment + pid_rep, gadarianFit, gadarian)

effects <- get_effects(estimates = prep,
                       variable = 'treatment',
                       type = 'pointestimate')

# plot effects
effects |>
  filter(topic == 3) |>
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() +
  theme_light() +
  labs(x = 'Treatment', y = 'Topic Proportion')

# combine estimates for interaction effects
prep_int <- estimateEffect(1:3 ~ treatment * s(pid_rep), gadarianFit, gadarian)

effects_int <- get_effects(estimates = prep_int,
                           variable = 'pid_rep',
                           type = 'continuous',
                           moderator = 'treatment',
                           modval = 1) |>
  bind_rows(
    get_effects(estimates = prep_int,
                variable = 'pid_rep',
                type = 'continuous',
                moderator = 'treatment',
                modval = 0)
  )

# plot interaction effects
effects_int |>
  filter(topic == 2) |>
  mutate(moderator = as.factor(moderator)) |>
  ggplot(aes(x = value, y = proportion,
             color = moderator, group = moderator, fill = moderator)) +
  geom_line() +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
  theme_light() +
  labs(x = 'PID Rep.', y = 'Topic Proportion',
       color = 'Treatment', group = 'Treatment', fill = 'Treatment')
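The examples above cover type = 'pointestimate' and type = 'continuous'. A minimal sketch for type = 'difference', reusing the prep object from above and assuming the 0/1 coding of treatment in the gadarian data; cov_val1 and cov_val2 select the two covariate values to contrast:

# contrast topic proportions between control (0) and treatment (1);
# a sketch, assuming treatment is coded 0/1 as in the gadarian data
effects_diff <- get_effects(estimates = prep,
                            variable = 'treatment',
                            type = 'difference',
                            cov_val1 = 0,
                            cov_val2 = 1)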
get_network() is a helper function to extract topic correlation networks as tidygraph objects and add labels and topic proportions.
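This extract is missing the usage line for get_network(); the sketch below reconstructs it from the arguments documented next and from the example, so the default values shown are assumptions, not documented behavior:

# defaults are assumptions reconstructed from the argument list and example
get_network(model, method = 'simple', cutoff = 0.05, labels = NULL, cutiso = FALSE)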
model: The stm model for computing the correlation network.
method: The method for determining edges. Can be either 'simple' or 'huge'.
cutoff: The correlation cutoff criterion for method 'simple'.
labels: An optional vector of topic labels. Must include a label for each topic of the model.
cutiso: Remove isolated nodes without any edges from the network. Defaults to FALSE.
Returns a tidygraph network of topic correlations.
library(stm)
library(ggraph)
library(quanteda)

# prepare data
data <- corpus(gadarian, text_field = 'open.ended.response')
docvars(data)$text <- as.character(data)
data <- tokens(data, remove_punct = TRUE) |>
  tokens_wordstem() |>
  tokens_remove(stopwords('english')) |>
  dfm() |>
  dfm_trim(min_termfreq = 2)
out <- convert(data, to = 'stm')

# fit model
gadarian_10 <- stm(documents = out$documents,
                   vocab = out$vocab,
                   data = out$meta,
                   prevalence = ~ treatment + s(pid_rep),
                   K = 10,
                   max.em.its = 1, # reduce computation time for example
                   verbose = FALSE)

## Not run:
# extract network
stm_corrs <- get_network(model = gadarian_10,
                         method = 'simple',
                         labels = paste('Topic', 1:10),
                         cutoff = 0.001,
                         cutiso = TRUE)

# plot network
ggraph(stm_corrs, layout = 'auto') +
  geom_edge_link(
    aes(edge_width = weight),
    label_colour = '#fc8d62',
    edge_colour = '#377eb8') +
  geom_node_point(size = 4, colour = 'black') +
  geom_node_label(
    aes(label = name, size = props),
    colour = 'black', repel = TRUE, alpha = 0.85) +
  scale_size(range = c(2, 10), labels = scales::percent) +
  labs(size = 'Topic Proportion',
       edge_width = 'Topic Correlation') +
  scale_edge_width(range = c(1, 3)) +
  theme_graph()
## End(Not run)
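Since get_network() returns a tidygraph object, the network can also be inspected or filtered with tidygraph verbs before plotting. A minimal sketch, reusing stm_corrs from the example above and the name and props node attributes it relies on:

library(tidygraph)
library(dplyr)

# inspect node attributes (topic labels and proportions) as a tibble
stm_corrs |>
  activate(nodes) |>
  as_tibble()

# keep only edges above a stricter correlation threshold before plotting
stm_corrs |>
  activate(edges) |>
  filter(weight > 0.01)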
run_stminsights launches the app to analyze structural topic models. It requires a .RData file with stm objects, as illustrated in the example below.
run_stminsights(use_browser = TRUE)
use_browser: Choose whether you want to launch the shiny app in your browser. Defaults to TRUE.
## Not run:
library(stm)
library(quanteda)

# prepare data
data <- corpus(gadarian, text_field = 'open.ended.response')
docvars(data)$text <- as.character(data)
data <- tokens(data, remove_punct = TRUE) |>
  tokens_wordstem() |>
  tokens_remove(stopwords('english')) |>
  dfm() |>
  dfm_trim(min_termfreq = 2)
out <- convert(data, to = 'stm')

# fit models and effect estimates
gadarian_3 <- stm(documents = out$documents,
                  vocab = out$vocab,
                  data = out$meta,
                  prevalence = ~ treatment + s(pid_rep),
                  K = 3,
                  max.em.its = 1, # reduce computation time for example
                  verbose = FALSE)

prep_3 <- estimateEffect(1:3 ~ treatment + s(pid_rep), gadarian_3,
                         meta = out$meta)

gadarian_5 <- stm(documents = out$documents,
                  vocab = out$vocab,
                  data = out$meta,
                  prevalence = ~ treatment + s(pid_rep),
                  K = 5,
                  max.em.its = 1, # reduce computation time for example
                  verbose = FALSE)

prep_5 <- estimateEffect(1:5 ~ treatment + s(pid_rep), gadarian_5,
                         meta = out$meta)

# save objects in .RData file
save.image(paste0(tempdir(), '/stm_gadarian.RData'))

# launch the app
if (interactive()) {
  run_stminsights()
}
## End(Not run)
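Setting use_browser = FALSE launches the app in the default Shiny viewer (e.g. the RStudio viewer pane) rather than the system browser; a minimal usage sketch:

# launch in the Shiny/RStudio viewer instead of the browser
if (interactive()) {
  run_stminsights(use_browser = FALSE)
}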