# Working directory containing the data files.
# NOTE: setwd() in scripts is fragile (machine-specific path); kept for
# compatibility, but an RStudio project / here::here() would be preferable.
setwd("~/Dropbox (Personal)/Ciencia de dados")
# Packages used throughout the script (previously loaded implicitly)
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(tidytext)
library(psych)
library(RColorBrewer)
# Read the "Usos" (alternate uses task) sheet
usos <- read_excel("usos.xlsx", sheet = "Usos")
# Activity 1: cardboard box
# Activity 2: bricks
# Keep activity-1 rows with a non-missing response; first five columns only
dt_cx <- usos %>%
  filter(ativi == 1, !is.na(resposta)) %>%
  select(1:5)
# Compute fluency (number of ideas per person per occasion) and examine
# its distribution.
# FIX: seq(1:25) only worked by accident (seq on a length>1 vector returns
# seq_along(it)); 1:25 states the intent directly.
dt_cx %>%
  group_by(ID, pre_pos) %>%
  count() %>%
  arrange(ID, pre_pos) %>%
  filter(n < 25) %>%   # drop extreme fluency counts
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 2, color = "white", fill = "gray") +
  scale_x_continuous(breaks = 1:25, limits = c(1, 25))
# Correlation between pre and post fluency: one column per occasion,
# then a scatterplot of occasion 1 vs occasion 2
dt_cx %>%
  group_by(ID, pre_pos) %>%
  summarise(n = n()) %>%
  arrange(ID, pre_pos) %>%
  filter(n < 25) %>%
  spread(key = pre_pos, value = n) %>%
  ggplot(aes(x = `1`, y = `2`)) +
  geom_point()
# Tokenize responses: one word per row
dt_cx2 <- dt_cx %>% unnest_tokens(words, resposta)
length(unique(dt_cx2$words))
## [1] 1242
# Read Portuguese stopword list (one word per line).
# Naming the column directly via col_names replaces the later
# names(stopwords) = "words" assignment (which also used `=` instead of <-).
stopwords <- read_csv(
  file = "http://www.labape.com.br/rprimi/ds/stopwords.txt",
  col_names = "words")
dt_cx2 %>% count(words, sort = TRUE)
## # A tibble: 1,242 x 2
## words n
## <chr> <int>
## 1 de 760
## 2 para 252
## 3 fazer 212
## 4 guardar 198
## 5 caixa 173
## 6 uma 144
## 7 papelão 141
## 8 casa 139
## 9 um 138
## 10 casinha 113
## # ... with 1,232 more rows
# Most frequent words overall (more than 50 occurrences), horizontal bars
dt_cx2 %>%
  count(words, sort = TRUE) %>%
  filter(n > 50) %>%
  ggplot(aes(x = reorder(words, n), y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Word frequencies after removing stopwords (n > 15).
# FIX: the reordered column is kept as `words`, consistent with the other
# frequency plots (the original created a duplicate `word` column).
# Explicit join key silences the "Joining, by" message without changing
# behavior ("words" is the only shared column).
dt_cx2 %>%
  count(words, sort = TRUE) %>%
  anti_join(stopwords, by = "words") %>%
  filter(n > 15) %>%
  mutate(words = reorder(words, n)) %>%
  ggplot(aes(words, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Word cloud of non-stopword frequencies
library(wordcloud)
# FIX: "Set1" has at most 9 colors; brewer.pal(12, "Set1") only warned and
# returned 9, so request the maximum explicitly.
dt_cx2 %>%
  anti_join(stopwords, by = "words") %>%
  count(words) %>%
  with(wordcloud(words, n,
                 colors = brewer.pal(9, "Set1"),
                 max.words = 100))
# Append extra stopwords spotted in the cloud.
# bind_rows() is robust to the list's current length (the hard-coded row
# indices 221/222 would create NA gaps if the source file changed).
stopwords <- bind_rows(stopwords, tibble(words = c("pra", "dá")))
# See: Beaty, R. E., & Silvia, P. J. (2012). Why do ideas get more creative
# across time? An executive interpretation of the serial order effect in
# divergent thinking tasks. Psychology of Aesthetics, Creativity, and the
# Arts, 6(4), 309-319. http://dx.doi.org/10.1037/a0029171
# Attach each word's overall corpus frequency to the token table.
word_freq <- dt_cx2 %>%
  count(words, sort = TRUE)
# FIX: explicit key + rename() replace the positional
# names(dt_cx2)[6] <- "frq", which broke silently whenever a column was
# added or removed upstream.
dt_cx2 <- dt_cx2 %>%
  left_join(word_freq, by = "words") %>%
  rename(frq = n)
# Distribution of response serial position (resp_num), as proportions
dt_cx2 %>%
  ggplot(aes(x = resp_num)) +
  geom_histogram(
    aes(y = (..count..)/sum(..count..), group = 1),
    binwidth = 2,
    color = "white",
    fill = "gray"
  )
# Bin serial position into thirds: 1 = positions 1-5, 2 = 6-10, 3 = 11+.
# case_when evaluates conditions in order, so each test only needs the
# upper bound; NA inputs stay NA.
dt_cx2 <- dt_cx2 %>%
  mutate(
    resp_num2 = case_when(
      resp_num <= 5  ~ 1,
      resp_num <= 10 ~ 2,
      resp_num >  10 ~ 3
    )
  )
# Cross-tabulation confirms the recoding
table(dt_cx2$resp_num, dt_cx2$resp_num2)
##
## 1 2 3
## 1 729 0 0
## 2 723 0 0
## 3 634 0 0
## 4 630 0 0
## 5 576 0 0
## 6 0 559 0
## 7 0 488 0
## 8 0 443 0
## 9 0 383 0
## 10 0 353 0
## 11 0 0 265
## 12 0 0 228
## 13 0 0 174
## 14 0 0 136
## 15 0 0 117
## 16 0 0 102
## 17 0 0 104
## 18 0 0 66
## 19 0 0 56
## 20 0 0 46
## 21 0 0 53
## 22 0 0 36
## 23 0 0 34
## 24 0 0 32
## 25 0 0 22
## 26 0 0 1
# Word-frequency distribution by serial-position tercile (stopwords removed)
dt_cx2 %>%
  anti_join(stopwords) %>%
  ggplot(aes(x = frq, fill = factor(resp_num2))) +
  geom_histogram(aes(y = (..count..)/sum(..count..), group = 1)) +
  facet_grid(resp_num2 ~ .) +
  scale_fill_brewer(palette = "Set2")
# Boxplots: corpus word frequency by serial-position tercile
dt_cx2 %>%
  anti_join(stopwords) %>%
  ggplot(aes(x = resp_num2, y = frq, fill = factor(resp_num2))) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_brewer(palette = "Set1")
# Word proportions by occasion (pre/post), wide format, 0 for absent words
fr_pre_pos <- dt_cx2 %>%
  count(pre_pos, words, sort = TRUE) %>%
  anti_join(stopwords, by = "words") %>%
  group_by(pre_pos) %>%
  mutate(soma = sum(n),
         prop = n / sum(n)
  ) %>%
  select(-n, -soma) %>%
  spread(pre_pos, prop) %>%
  mutate(
    `1` = ifelse(is.na(`1`), 0, `1`),
    `2` = ifelse(is.na(`2`), 0, `2`))
# Scatter of word proportions: pre (x) vs post (y).
# FIX: geom_text() has no `just` argument (it was being ignored); vjust is
# what was intended, matching the later plot at the corr.test section.
ggplot(fr_pre_pos, aes(x = `1`, y = `2`)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5) +
  geom_text(aes(label = words), check_overlap = FALSE, vjust = 1.5) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none")
# Pre/post correlation of word proportions (psych::corr.test)
corr.test(fr_pre_pos[ , 2:3])
## Call:corr.test(x = fr_pre_pos[, 2:3])
## Correlation matrix
## 1 2
## 1 1.00 0.85
## 2 0.85 1.00
## Sample Size
## [1] 1178
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## 1 2
## 1 0 0
## 2 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# Same scatter, labels thinned via check_overlap
fr_pre_pos %>%
  ggplot(aes(x = `1`, y = `2`)) +
  geom_point(alpha = 1/20) +
  geom_text(aes(label = words), check_overlap = TRUE, vjust = 1.5)
# A term's inverse document frequency (idf) decreases the weight of commonly
# used words and increases the weight of words rarely used in a collection of
# documents. Combined with term frequency it yields tf-idf (the two
# quantities multiplied together): the frequency of a term adjusted for how
# rarely it is used (p. 31).
# The idea of tf-idf is to find the words important to the content of each
# document by down-weighting common words and up-weighting words that are
# rare across the collection or corpus of documents (p. 37).
# \[ idf_{termo} = \ln \frac{n_{documentos}}{n_{documentos.contendo.termo}} \]
# Word frequency per subject (each subject = one "document")
subj_words <- dt_cx2 %>%
  count(ID, words, sort = TRUE) %>%
  ungroup()
# Total words per subject
total_words <- subj_words %>%
  group_by(ID) %>%
  summarize(total = sum(n))
# Merge totals back; explicit key silences the join message
subj_words <- left_join(subj_words, total_words, by = "ID")
# Within-subject term frequency
subj_words <- subj_words %>%
  mutate(term_freq = n/total)
# Number of documents (subjects) in the corpus
subj_words <- subj_words %>%
  mutate(n_docs = n_distinct(ID))
# Document frequency of each word and its (log) inverse.
# NOTE: this leaves subj_words grouped by `words`; downstream code shows
# "Groups: words" output, so the grouping is deliberately kept.
subj_words <- subj_words %>%
  group_by(words) %>%
  mutate(word_doc_freq = n_distinct(ID)/n_docs,
         inv_word_doc_freq = n_docs/n_distinct(ID),
         ln_inv_word_doc_freq = log(n_docs/n_distinct(ID)))
# Drop the artifact token "0"
subj_words <- subj_words %>% filter(words != "0")
# Words present in more than 8% of documents, ordered bars colored by
# document frequency
subj_words %>%
  group_by(words) %>%
  summarise(word_doc_freq = mean(word_doc_freq)) %>%
  arrange(desc(word_doc_freq)) %>%
  mutate(words = reorder(words, word_doc_freq)) %>%
  filter(word_doc_freq > 0.08) %>%
  ggplot(aes(x = words, y = word_doc_freq)) +
  geom_col(aes(fill = word_doc_freq)) +
  xlab(NULL) +
  coord_flip() +
  scale_fill_gradientn(colours = brewer.pal(5, "Paired"))
# Same chart with stopwords removed
subj_words %>%
  group_by(words) %>%
  summarise(word_doc_freq = mean(word_doc_freq)) %>%
  anti_join(stopwords) %>%
  arrange(desc(word_doc_freq)) %>%
  mutate(words = reorder(words, word_doc_freq)) %>%
  filter(word_doc_freq > 0.08) %>%
  ggplot(aes(x = words, y = word_doc_freq)) +
  geom_col(aes(fill = word_doc_freq)) +
  xlab(NULL) +
  coord_flip() +
  scale_fill_gradientn(colours = brewer.pal(5, "Paired"))
# term_freq vs document frequency, colored by inverse document frequency.
# FIX: the points are mapped to the `color` aesthetic, so the gradient must
# be a *colour* scale -- the original scale_fill_gradientn matched no mapped
# aesthetic and was silently ignored (compare the correct usage later with
# scale_colour_gradientn).
subj_words %>%
  ggplot(aes(
    x = word_doc_freq,
    y = term_freq,
    color = inv_word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  geom_text(aes(label = words),
            check_overlap = TRUE,
            vjust = 1.5,
            size = 3
  ) +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired"))
# Same scatter on word-level means, stopwords removed (same scale fix)
subj_words %>%
  select(2:9) %>%
  anti_join(stopwords) %>%
  group_by(words) %>%
  summarise_all(.funs = mean) %>%
  ggplot(aes(
    x = word_doc_freq,
    y = term_freq,
    color = inv_word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  geom_text(aes(label = words),
            check_overlap = TRUE,
            vjust = 1.5,
            size = 3
  ) +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired"))
# Document frequency vs its inverse (hyperbola).
# FIX: points are mapped to `color`, so use a colour scale (the fill scale
# matched nothing and was ignored).
subj_words %>%
  ggplot(aes(
    x = word_doc_freq,
    y = inv_word_doc_freq,
    color = inv_word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired"))
# ...and vs the log of the inverse (the idf curve); same scale fix
subj_words %>%
  ggplot(aes(
    x = word_doc_freq,
    y = ln_inv_word_doc_freq,
    color = inv_word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired"))
# tidytext's bind_tf_idf() reproduces the manual tf/idf computation above
subj_words <- subj_words %>%
  bind_tf_idf(term = words, document = ID, n = n)
# Document-term matrix of tf-idf weights (one row per subject)
dtm <- subj_words %>%
  select(c(1, 2, 12)) %>%
  spread(key = words, value = tf_idf)
# tf vs tf-idf colored by document frequency.
# FIX: colour scale replaces the ineffective fill scale (the points are
# mapped to `color`, not `fill`).
subj_words %>%
  ggplot(aes(
    x = tf,
    y = tf_idf,
    color = word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired")) +
  geom_text(aes(label = words),
            check_overlap = TRUE,
            vjust = 1.5,
            size = 3
  )
# External helper for quartile creation (loaded from labape.com.br).
source("http://www.labape.com.br/rprimi/R/cria_quartis.R")
# Tabulate document frequency in three hand-picked bins.
# NOTE(review): the break points look like tercile cut points computed
# elsewhere (possibly by cria_quartis) -- confirm before reusing them.
table(cut(subj_words$word_doc_freq, breaks = c(0, 0.003875969, 0.011627907, 0.399224806)))
##
## (0,0.00388] (0.00388,0.0116] (0.0116,0.399]
## 712 617 3288
# tf vs tf-idf at the word level, faceted by document-frequency band.
# FIX: colour scale replaces the ineffective fill scale (points use `color`).
subj_words %>%
  select(2:12) %>%
  anti_join(stopwords) %>%
  group_by(words) %>%
  summarise_all(.funs = mean) %>%
  ungroup() %>%
  mutate(grp_wf = cut(word_doc_freq, breaks = c(0, 0.003, 0.011, 0.20))) %>%
  ggplot(aes(
    x = tf,
    y = tf_idf,
    color = word_doc_freq
  )) +
  geom_point(alpha = 1/4) +
  theme_minimal() +
  scale_colour_gradientn(colours = brewer.pal(5, "Paired")) +
  geom_text(aes(label = words),
            check_overlap = TRUE,
            vjust = 1.5,
            size = 3
  ) +
  facet_grid(. ~ grp_wf)
# Highest tf-idf word-subject pairs
subj_words %>%
  arrange(desc(tf_idf))
## # A tibble: 5,027 x 12
## # Groups: words [1,241]
## ID words n total term_freq n_docs word_doc_freq inv_word_doc_fr…
## <dbl> <chr> <int> <int> <dbl> <int> <dbl> <dbl>
## 1 1.12e8 brin… 1 1 1 258 0.0271 36.9
## 2 1.10e8 cabe 22 50 0.44 258 0.00388 258
## 3 1.08e8 préd… 1 2 0.5 258 0.00775 129
## 4 1.10e8 cari… 1 2 0.5 258 0.00775 129
## 5 1.11e8 pôr 5 18 0.278 258 0.00388 258
## 6 1.06e8 casas 1 2 0.5 258 0.0465 21.5
## 7 1.08e8 casas 1 2 0.5 258 0.0465 21.5
## 8 1.09e8 casas 1 2 0.5 258 0.0465 21.5
## 9 1.11e8 metal 2 8 0.25 258 0.00388 258
## 10 1.07e8 cont… 1 4 0.25 258 0.00388 258
## # ... with 5,017 more rows, and 4 more variables:
## # ln_inv_word_doc_freq <dbl>, tf <dbl>, idf <dbl>, tf_idf <dbl>
# tf vs tf-idf with bounded y axis.
# FIX: scale_y_continuous()'s first positional argument is `name`, so the
# original seq() was being used as the axis *title*; pass it to `breaks`.
subj_words %>%
  ggplot(aes(x = tf, y = tf_idf, color = word_doc_freq)) +
  geom_point(alpha = 1/4) +
  scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
  scale_y_continuous(breaks = seq(0, 2.5, 0.5), limits = c(0, 2.5))
# Top-10 tf-idf words per subject, ordered for plotting.
# FIX: top_n(10) silently ranked by the *last* column; name the weight
# explicitly so the intent (tf_idf) no longer depends on column order.
temp <- subj_words %>%
  arrange(ID, desc(tf_idf)) %>%
  group_by(ID) %>%
  top_n(10, tf_idf) %>%
  mutate(word = forcats::fct_reorder(as.factor(words), tf_idf))
# Profiles for the first few subjects
temp[1:60, ] %>%
  ggplot(aes(word, tf_idf, fill = ID)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~ID, ncol = 2, scales = "free") +
  coord_flip()
# install.packages('servr')
library(stringr)
library(text2vec)
# Tokenization pipeline: lowercase + word tokenizer
prep_fun <- tolower
tok_fun <- word_tokenizer
it <- itoken(
  dt_cx$resposta,
  ids = dt_cx$ID,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  progressbar = FALSE)
# Vocabulary without stopwords; prune terms in more than 80% of documents
vocab <- create_vocabulary(it, stopwords = stopwords$words) %>%
  prune_vocabulary(
    term_count_min = 1,
    doc_proportion_max = 0.8
  )
# Document-term matrix (NOTE: this reuses/overwrites the name `dtm`)
vectorizer <- vocab_vectorizer(vocab)
dtm <- create_dtm(it, vectorizer)
dim(dtm)
## [1] 3082 1178
# Define the tf-idf weighting model
tfidf <- TfIdf$new()
# Fit to the data and transform it with the fitted model
dtm_tfidf <- fit_transform(dtm, tfidf)
# LDA topic model: 12 topics
lda_model <- LDA$new(
  n_topics = 12,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr <-
  lda_model$fit_transform(
    x = dtm_tfidf,
    n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
## INFO [2018-11-25 12:58:48] iter 25 loglikelihood = -56647.167
## INFO [2018-11-25 12:58:48] iter 50 loglikelihood = -54650.540
## INFO [2018-11-25 12:58:49] iter 75 loglikelihood = -54131.180
## INFO [2018-11-25 12:58:49] iter 100 loglikelihood = -53923.703
## INFO [2018-11-25 12:58:49] iter 125 loglikelihood = -53826.124
## INFO [2018-11-25 12:58:49] iter 150 loglikelihood = -53789.327
## INFO [2018-11-25 12:58:49] early stopping at 150 iteration
# Top 12 words for each of the 12 topics
lda_model$get_top_words(n = 12, topic_number = 1:12, lambda = 0.2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] "mesa" "desenhar" "maquete" "lixo" "armário"
## [2,] "celular" "geladeira" "tapete" "roupa" "boneco"
## [3,] "lousa" "mochila" "calendário" "guarda" "letras"
## [4,] "escudo" "chinelo" "arma" "reciclar" "janela"
## [5,] "casas" "aviao" "cartaz" "lápis" "barco"
## [6,] "mentira" "carregar" "livro" "armario" "relogio"
## [7,] "sapatos" "cesta" "desenho" "computador" "bonecos"
## [8,] "bola" "fogão" "garrafa" "gol" "anel"
## [9,] "recortar" "fogao" "óculos" "máscara" "cabe"
## [10,] "escorregador" "mudança" "estante" "vaso" "martelo"
## [11,] "ventilador" "caixinha" "banquinho" "fotos" "vender"
## [12,] "construir" "jogo" "tabuleiro" "mudanças" "tijolo"
## [,6] [,7] [,8] [,9] [,10]
## [1,] "casa" "avião" "brinquedos" "carro" "casinha"
## [2,] "cadeira" "relógio" "caderno" "castelo" "sapato"
## [3,] "boneca" "robô" "estojo" "bolsa" "roupas"
## [4,] "cachorro" "árvore" "cama" "capacete" "escorregar"
## [5,] "caminhão" "enfeites" "faca" "televisão" "livros"
## [6,] "flor" "casinhas" "pintura" "esconder" "grama"
## [7,] "maquiagem" "forte" "mesinha" "bonecas" "gato"
## [8,] "cartas" "colar" "brinco" "quadro" "prateleira"
## [9,] "brincar" "gaiola" "quadros" "banco" "barbie"
## [10,] "pratos" "compras" "alfabeto" "parede" "pasta"
## [11,] "escola" "natal" "colher" "esconderijo" "placas"
## [12,] "cofre" "apoiador" "robo" "sofá" "jogar"
## [,11] [,12]
## [1,] "espada" "carrinho"
## [2,] "armadura" "foguete"
## [3,] "tv" "enfeite"
## [4,] "arminha" "reciclagem"
## [5,] "prédio" "papel"
## [6,] "lapis" "caneta"
## [7,] "carteira" "pote"
## [8,] "cartazes" "cozinha"
## [9,] "escada" "lixinho"
## [10,] "nave" "desenhos"
## [11,] "lousinha" "carrinhos"
## [12,] "objetos" "dinheiro"
# Interactive LDAvis visualization
lda_model$plot()
library(stringr)
library(text2vec)
# Same tokenization pipeline as before
prep_fun <- tolower
tok_fun <- word_tokenizer
it <- itoken(dt_cx$resposta,
             preprocessor = prep_fun,
             tokenizer = tok_fun,
             ids = dt_cx$ID,
             progressbar = FALSE)
# Trigram-only vocabulary (ngram = c(3, 3)), stopwords removed
vocab <- create_vocabulary(
  it,
  stopwords = stopwords$words,
  ngram = c(3, 3)
) %>%
  prune_vocabulary(
    term_count_min = 1,
    doc_proportion_max = 0.8)
dim(vocab)
## [1] 507 3
vectorizer <- vocab_vectorizer(vocab)
dtm <- create_dtm(it, vectorizer)
dim(dtm)
## [1] 3082 507
# Define the tf-idf model
tfidf <- TfIdf$new()
# Fit to the data and transform with the fitted model
dtm_tfidf <- fit_transform(dtm, tfidf)
dim(dtm_tfidf)
## [1] 3082 507
# LDA on trigrams: 8 topics
lda_model <- LDA$new(
  n_topics = 8,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr <-
  lda_model$fit_transform(
    x = dtm_tfidf, n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
## INFO [2018-11-25 12:58:49] iter 25 loglikelihood = -10279.961
## INFO [2018-11-25 12:58:49] iter 50 loglikelihood = -10282.998
## INFO [2018-11-25 12:58:49] early stopping at 50 iteration
# Topic proportions for document 2
barplot(
  doc_topic_distr[2, ], xlab = "topic",
  ylab = "proportion", ylim = c(0, 1),
  names.arg = 1:ncol(doc_topic_distr)
)
# Top trigrams for the first four topics
lda_model$get_top_words(n = 12, topic_number = 1:4, lambda = 0.2)
## [,1] [,2]
## [1,] "dar_moradores_rua" "caixa_guardar_coisas"
## [2,] "brincar_nave_espacial" "fazer_carrinho_papelão"
## [3,] "fazer_porta_celular" "carrinho_roleman_papelao"
## [4,] "fazer_casinha_polly" "caixa_presente_papelão"
## [5,] "capa_caderno_papelão" "fazer_casinha_papelão"
## [6,] "reciclar_todos_tamanhos" "fazer_tv_brincar"
## [7,] "colocar_coisa_velha" "orelha_coelho_páscoa"
## [8,] "casa_caixa_papelão" "fazer_carro_brincar"
## [9,] "b_onecas_papelão" "fazer_brinquedos_infantis"
## [10,] "guardar_pessoa_carro" "usar_manobras_skate"
## [11,] "melhor_fazer_brinquedos" "fazer_ferramentas_brincadeira"
## [12,] "armadura_brincar_guerreiro" "caixa_porta_sapatos"
## [,3] [,4]
## [1,] "caixa_guardar_brinquedos" "guardar_brinquedos_coisas"
## [2,] "guardar_coisas_antigas" "fazer_casinha_barbie"
## [3,] "fazer_casinha_gato" "guardar_filmes_dvd's"
## [4,] "forrar_chão_pintar" "mesinha_colo_papelao"
## [5,] "utilizar_sapatos_dentro" "voce_mudar_casa"
## [6,] "montar_tv_brinquedo" "pode_escorregar_papelão"
## [7,] "guardar_arvore_natal" "caixa_anel_casamento"
## [8,] "caixa_guardar_roupa" "capa_caderno_papelao"
## [9,] "fazer_trabalho_escola" "fazer_cabelo_bonecos"
## [10,] "cartazes_anunciando_algo" "colocar_coisas_mudança"
## [11,] "fazer_enfeites_festas" "caixa_guardar_garrafa"
## [12,] "caminha_cachorro_dormir" "caixa_doação_roupas"
lda_model$plot()
library(readr)
# Read 300-d GloVe word embeddings (NILC); the first line is a header,
# hence skip = 1. First column is the word, the rest are the vector.
nilc_wv <- read_delim(
  file = "glove_s300.txt",
  delim = " ", quote = "",
  skip = 1,
  col_names = FALSE,
  progress = TRUE)
names(nilc_wv)[1] <- "words"
# paste0() is the idiomatic form of paste(..., sep = "")
names(nilc_wv)[2:301] <- paste0("V", 1:300)
# 600-d embeddings alternative (disabled)
# load("~/Documents/word_vectors/nilc_wv_glove_600d.RData")
# Build the vocabulary and attach the embedding vectors
vocab <- dt_cx2 %>% count(words, sort = TRUE)
vocab <- left_join(vocab, nilc_wv, by = "words")
# Logical vector: TRUE when the word has an embedding vector.
# FIX: renamed from `select` so it no longer shadows dplyr::select().
has_vec <- !is.na(vocab$V1)
# Hierarchical clustering of the embedding vectors (euclidean + Ward)
# NOTE(review): header said "100 clusters" but k = 200 is used below.
d <- dist(vocab[has_vec, 3:302], method = "euclidean")
cluster <- hclust(d, method = "ward.D2")
plot(cluster)
grp200 <- cutree(cluster, k = 200)
# Keep word + count and attach the 200-cluster assignment
vocab <- vocab %>% filter(has_vec) %>% select(1:2) %>% bind_cols(as.data.frame(grp200))
# Bring the cluster id back onto the token table
dt_cx2 <- dt_cx2[, 1:7]
dt_cx2 <- dt_cx2 %>% left_join(vocab, by = "words")
# Expand the cluster id into 0/1 dummy columns (psych::dummy.code)
dt_cx2 <- bind_cols(dt_cx2, as.data.frame(dummy.code(dt_cx2$grp200)))
names(dt_cx2)
# Per-response table: sum the cluster dummies within each response
# (columns 10:209 are the 200 dummy columns -- positional and fragile)
dt_cx3 <- dt_cx2 %>% select(c(1, 2, 4, 10:209)) %>%
group_by(ID, resp_num, pre_pos) %>%
summarise_all(.funs = sum, na.rm=TRUE)
names(dt_cx3)
# Cluster the responses on their dummy profiles (binary distance, Ward)
# NOTE(review): the original header says "40 clusters" but k = 60 is used.
d <- dist( dt_cx3[, 4:203], method="binary")
cluster <- hclust(d, method="ward.D2")
plot(cluster)
grp60 <- cutree(cluster, k = 60)
dt_cx3 <- bind_cols(dt_cx3, as.data.frame(grp60))
names(dt_cx3)
# Column 204 of dt_cx3 is the grp60 assignment; join it back onto dt_cx
dt_cx <- left_join(dt_cx[, 1:5], dt_cx3[, c(1:3, 204)])
library(Rtsne)
library(ggrepel)
library(ggthemes)
library(RColorBrewer)
library(artyfarty)
# Re-read the 300-d GloVe embeddings
nilc_wv <- read_delim(
  file = "glove_s300.txt",
  delim = " ", quote = "",
  skip = 1,
  col_names = FALSE,
  progress = TRUE)
names(nilc_wv)[1] <- "words"
# paste0() replaces paste(..., sep = "")
names(nilc_wv)[2:301] <- paste0("V", 1:300)
# Re-attach the vectors to the (words, n, grp200) vocabulary
vocab <- vocab[ , 1:3]
vocab <- left_join(vocab, nilc_wv, by = "words")
# TRUE where the word has an embedding vector.
# CAUTION: this object shadows dplyr::select(); the name is kept because
# later code still references it.
select <- vocab %>% select(V1) %>% is.na %>% as.logical()
select <- !select
names(vocab)
# 2-D t-SNE projection of the embedding space
tsne_out <- Rtsne(vocab[select , 4:303], perplexity = 18)
vocab <- cbind(vocab[select, 1:3] , as.data.frame(tsne_out$Y))
names(vocab)
# t-SNE map of the words, colored by embedding cluster.
# FIX: the original colored by `grp60`, a response-level global vector of a
# different length; `grp200` is the cluster column actually present in
# `vocab` at this point -- confirm this was the intent.
ggplot(data = vocab,
       mapping = aes(
         y = V1,
         x = V2,
         color = grp200)
) +
  geom_point() +
  geom_text_repel(
    aes(label = words),
    size = 2, vjust = -1.2
  ) +
  theme_minimal() +
  scale_color_gradientn(colours = brewer.pal(12, "Paired"))
# Cluster the words on their 2-D t-SNE coordinates.
# FIX: "euclidian" was a typo -- dist() only accepts "euclidean" (the
# misspelling fails partial matching and dist() errors).
d <- dist(vocab[, 4:5], method = "euclidean")
cluster <- hclust(d, method = "ward.D2")
plot(cluster)
grp60 <- cutree(cluster, k = 60)
# Attach the t-SNE-based clusters to word + count.
# FIX: the original filter(select) reused a logical vector longer than the
# (already-subset) vocab and would error; vocab was filtered at the cbind
# step above, so no further row filter is needed.
vocab <- vocab %>% select(1:2) %>% bind_cols(as.data.frame(grp60))
names(dt_cx2)
# Bring the t-SNE-based cluster onto the token table
dt_cx2b <- dt_cx2[, c(1:7)] %>% left_join(vocab)
# Expand the cluster id into 0/1 dummy columns.
# NOTE(review): grp60 has 60 levels but only columns 10:49 (40 dummies) are
# carried below -- confirm whether 10:69 was intended.
dt_cx2b <- bind_cols(dt_cx2b, as.data.frame(dummy.code(dt_cx2b$grp60)))
names(dt_cx2b)
# Per-response table: sum the dummies within each response
dt_cx3b <- dt_cx2b %>% select(c(1, 2, 4, 10:49)) %>%
  group_by(ID, resp_num, pre_pos) %>%
  summarise_all(.funs = sum, na.rm=TRUE)
names(dt_cx3b)
# Cluster the responses (binary distance, Ward).
# NOTE(review): this clusters dt_cx3[, 4:69] -- the *previous* per-response
# table -- while the surrounding code suggests dt_cx3b was intended; verify.
d <- dist( dt_cx3[, 4:69], method="binary")
cluster <- hclust(d, method="ward.D2")
plot(cluster)
grp60b <- cutree(cluster, k = 60)
names(dt_cx3b)
# FIX: the vector created above is grp60b; `grp40b` was never defined and
# this line errored as written.
dt_cx3b <- bind_cols(dt_cx3b, as.data.frame(grp60b))
# Column 44 of dt_cx3b is grp60b; join it back onto dt_cx
dt_cx <- left_join(dt_cx[, 1:6], dt_cx3b[, c(1:3, 44)])
# FIX: the cluster columns are named grp60 / grp60b (grp40* never existed)
table(dt_cx$grp60, dt_cx$grp60b)
table(dt_cx$grp60)
# Third text2vec pass: unigram vocabulary for LSA document embeddings
prep_fun <- tolower
tok_fun <- word_tokenizer
it <- itoken(dt_cx$resposta,
             preprocessor = prep_fun,
             tokenizer = tok_fun,
             ids = dt_cx$ID,
             progressbar = FALSE)
# FIX: removed the trailing comma, which passed an empty (missing) argument
# to create_vocabulary().
vocab <- create_vocabulary(
  it,
  stopwords = stopwords$words
) %>%
  prune_vocabulary(doc_proportion_max = 0.60, term_count_min = 2)
vectorizer <- vocab_vectorizer(vocab)
dtm <- create_dtm(it, vectorizer)
dim(dtm)
# Define the tf-idf model
tfidf <- TfIdf$new()
# Fit to the data and transform it with the fitted model
dtm_tfidf <- fit_transform(dtm, tfidf)
# 20-dimensional LSA document embeddings from the tf-idf weighted dtm
lsa <- LSA$new(n_topics = 20)
doc_embeddings <- dtm %>%
  fit_transform(tfidf) %>%
  fit_transform(lsa)
dim(doc_embeddings)
dim(lsa$components)
# Cluster analysis of the LSA document embeddings
library(NbClust)
library(factoextra)
# Keep only non-duplicated embedding rows -- presumably to avoid issues
# with identical points in the clustering; confirm.
s <- !duplicated(doc_embeddings)
# Elbow (within-SS) plot for up to 60 clusters
as.data.frame(doc_embeddings) %>%
filter(s) %>%
fviz_nbclust(x = .,
FUNcluster = hcut,
method = c("wss"), k.max = 60)
# NOTE(review): dt_cx$grp32 is not created anywhere in this script --
# presumably a 32-cluster solution from an earlier run; confirm upstream
# before executing this section.
table(dt_cx$grp32)
# Elbow plot restricted to responses in cluster/category 5
s <- !duplicated(doc_embeddings) & dt_cx$grp32==5
as.data.frame(doc_embeddings) %>%
filter(s) %>%
fviz_nbclust(x = .,
FUNcluster = hcut,
method = c("wss"), k.max = 40)
# Sub-cluster category 5 into 10 groups (euclidean distance, Ward linkage)
s <- dt_cx$grp32==5
d <- as.data.frame(doc_embeddings) %>% filter(s) %>% dist(method="euclidean")
cluster <- hclust(d, method="ward.D2")
plot(cluster)
grp10 <- cutree(cluster, k = 10)
# Attach the 10-group solution to the category-5 responses and tabulate
dt_cx_cat5 <- dt_cx %>% filter(grp32==5) %>% cbind(., grp10)
table(dt_cx_cat5$grp10)