Dados

  # Load data.
  # NOTE(review): setwd() with a machine-specific path makes the script
  # non-portable; prefer an RStudio project or relative paths.
  setwd("~/Dropbox (Personal)/Ciencia de dados")

  # Read the "Usos" sheet (alternate-uses divergent-thinking responses).
  # Assumes readxl is attached elsewhere (read_excel).
  usos <- read_excel("usos.xlsx", sheet = "Usos") 
  
  

  # Activity 1: cardboard box
  # Activity 2: bricks

Descreve dados

# Select activity 1 (cardboard box), keeping only non-missing responses
   dt_cx <- usos %>% filter(ativi==1 & !is.na(resposta)) %>% select(1:5)

# Compute fluency (responses per subject x occasion) and examine distribution
    dt_cx %>% group_by(ID, pre_pos) %>% count() %>% 
      arrange(ID, pre_pos) %>% filter(n < 25) %>%
      ggplot(aes(x = n)) + 
      geom_histogram(binwidth = 2, color = "white", fill = "gray") +
      # seq(1:25) was really seq_along(1:25); 1:25 states the intent directly
      scale_x_continuous(breaks = 1:25, limits = c(1, 25))

# Correlation between pre-test (`1`) and post-test (`2`) fluency
    dt_cx %>% group_by(ID, pre_pos) %>% count() %>% 
        arrange(ID, pre_pos) %>% filter(n < 25) %>%
        spread(key = pre_pos, value = n) %>%
        ggplot(aes(x = `1`, y = `2`)) + geom_point()

Tokenize

# Tokenize words: one row per word of each response (tidytext)
  dt_cx2 <- dt_cx %>% unnest_tokens(words, resposta)

# Number of distinct word types in the corpus
length(unique(dt_cx2$words))
## [1] 1242
# Read Portuguese stopword list (one word per line, no header)

  stopwords <- read_csv(
    file = "http://www.labape.com.br/rprimi/ds/stopwords.txt", 
    col_names = FALSE)
  
  # Name the single column "words" so anti_join() matches by column name
  names(stopwords) = "words"

Frequência de palavras

    # Overall word frequencies (stopwords still included at this point)
    dt_cx2 %>% count(words, sort = TRUE)
## # A tibble: 1,242 x 2
##    words       n
##    <chr>   <int>
##  1 de        760
##  2 para      252
##  3 fazer     212
##  4 guardar   198
##  5 caixa     173
##  6 uma       144
##  7 papelão   141
##  8 casa      139
##  9 um        138
## 10 casinha   113
## # ... with 1,232 more rows
# Most common words overall (n > 50), horizontal bar chart
    dt_cx2 %>%
        count(words, sort = TRUE) %>%
        filter(n > 50 ) %>%
        # reorder so bars are sorted by frequency after coord_flip()
        mutate(words = reorder(words, n)) %>%
       ggplot(aes(words, n)) +
        geom_col() +
        xlab(NULL) +
        coord_flip()

# Remove stopwords, then plot words with n > 15
     dt_cx2 %>%
        count(words, sort = TRUE) %>%
        anti_join(stopwords) %>%
        filter(n > 15)  %>%
        # creates `word` (singular) alongside `words`; the plot uses `word`
        mutate(word = reorder(words, n)) %>%
        ggplot(aes(word, n)) +
        geom_col() +
        xlab(NULL) +
        coord_flip()

# Word cloud of the 100 most frequent non-stopword words
   library(wordcloud)

    dt_cx2 %>%
        anti_join(stopwords) %>%
        count(words) %>%
        with(wordcloud(words, n, 
          # "Set1" has at most 9 colors; requesting 12 triggered a warning
          colors = brewer.pal(9, "Set1"),
            max.words = 100)) 

    # NOTE(review): these stopwords are appended AFTER the wordcloud above
    # already used the list, so "pra"/"dá" only affect later anti_join calls.
    stopwords[221, "words"] <- "pra"
    stopwords[222, "words"] <- "dá"

Teste da hipótese “serial order effect”

Veja: Beaty, R. E., & Silvia, P. J. (2012). Why do ideas get more creative across time? An executive interpretation of the serial order effect in divergent thinking tasks. Psychology of Aesthetics, Creativity, and the Arts, 6(4), 309-319. http://dx.doi.org/10.1037/a0029171

  # Corpus frequency lookup table, one row per word
  word_freq <- dt_cx2 %>%
        count(words, sort = TRUE)

  # Attach each token's corpus frequency to the tokenized data
  dt_cx2 <- dt_cx2 %>% left_join(word_freq)
  
  # Rename the joined count column (column 6) to "frq"
  names(dt_cx2 )[6] <- "frq"
  
  # Distribution of response position (serial order) as proportions
  dt_cx2 %>% 
    ggplot(aes(x=resp_num)) + 
    geom_histogram(
      aes(y = (..count..)/sum(..count..), group = 1),
      binwidth = 2, 
      color = "white", 
      fill = "gray"
      )

   # Bin serial position into three groups: 1-5, 6-10, 11+
   dt_cx2 <- dt_cx2 %>% 
    mutate(
     resp_num2 =
      case_when(
      resp_num <=5 ~ 1,
      resp_num > 5 & resp_num <= 10  ~ 2,
      resp_num > 10  ~ 3
       )
      )
   
   # Sanity check: each response position falls in exactly one bin
   table(dt_cx2$resp_num, dt_cx2$resp_num2)
##     
##        1   2   3
##   1  729   0   0
##   2  723   0   0
##   3  634   0   0
##   4  630   0   0
##   5  576   0   0
##   6    0 559   0
##   7    0 488   0
##   8    0 443   0
##   9    0 383   0
##   10   0 353   0
##   11   0   0 265
##   12   0   0 228
##   13   0   0 174
##   14   0   0 136
##   15   0   0 117
##   16   0   0 102
##   17   0   0 104
##   18   0   0  66
##   19   0   0  56
##   20   0   0  46
##   21   0   0  53
##   22   0   0  36
##   23   0   0  34
##   24   0   0  32
##   25   0   0  22
##   26   0   0   1
    # Word-frequency distribution by serial-order bin (stopwords removed)
    dt_cx2 %>% 
      anti_join(stopwords) %>%
      ggplot(aes(x=frq, fill = as.factor(resp_num2))) +
      geom_histogram(
      aes(y = (..count..)/sum(..count..), group = 1)) +
      facet_grid(resp_num2~.) +
      scale_fill_brewer(palette = "Set2") 

     # Boxplots of word frequency by serial-order bin; lower frq for later
     # responses would support the serial order effect (rarer = more original)
     dt_cx2 %>% 
      anti_join(stopwords) %>%
      ggplot(aes(y=frq, x=resp_num2, fill = as.factor(resp_num2))) +
      geom_boxplot(alpha=1/2)  +
      scale_fill_brewer(palette = "Set1") 

Quão correlacionados os desempenhos no pré e pos teste ?

    # Word proportions by occasion (1 = pre, 2 = post), wide format;
    # words absent from one occasion get proportion 0
    fr_pre_pos <-  dt_cx2 %>%
        count(pre_pos, words, sort = TRUE) %>%
        anti_join(stopwords) %>%
        group_by(pre_pos) %>%
        mutate(soma = sum(n),
          prop = n / sum(n)
          
          ) %>% 
        select(-n, -soma) %>% 
        spread(pre_pos, prop) %>%
        mutate(
            `1` = ifelse(is.na(`1`), 0 ,`1`),
            `2` = ifelse(is.na(`2`), 0 ,`2`))
    
     
      
       # Pre vs. post word proportions with labels; dashed line = identity
       ggplot(fr_pre_pos, aes(x = `1`, y = `2`)) +
        geom_abline(color = "gray40", lty = 2) +
        geom_jitter(alpha = 0.1, size = 2.5) +
        # geom_text() has no `just` argument (it was silently ignored);
        # vjust is the intended parameter
        geom_text(aes(label = words), check_overlap = FALSE, vjust = 1.5) +
       
        # NOTE(review): no colour aesthetic is mapped, so this scale is inert
        scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
        theme(legend.position="none")

        # Pre/post correlation of word proportions (psych::corr.test)
        corr.test(fr_pre_pos[ , 2:3])
## Call:corr.test(x = fr_pre_pos[, 2:3])
## Correlation matrix 
##      1    2
## 1 1.00 0.85
## 2 0.85 1.00
## Sample Size 
## [1] 1178
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##   1 2
## 1 0 0
## 2 0 0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
        # Same scatter without jitter; overlapping labels suppressed
        ggplot(fr_pre_pos, aes(x = `1`, y = `2`)) +
            geom_point(alpha=1/20) +
            geom_text(mapping = aes(label = words), check_overlap = TRUE, vjust = 1.5) 

Term’s inverse document frequency (idf)

Term’s inverse document frequency (idf), which decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. This can be combined with term frequency to calculate a term’s tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used (p. 31)

The idea of tf-idf is to find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents (p. 37)

\[idf_{termo} = \mbox{ln} \frac{n_{documentos}}{n_{documentos.contendo.termo}}\]

# Word counts per subject (each subject = one "document")
 subj_words <- dt_cx2 %>%
  count(ID, words, sort = TRUE) %>%
  ungroup()

# Total word count per subject
  total_words <- subj_words %>% 
    group_by(ID) %>% 
    summarize(total = sum(n))


# Join subject totals into subj_words
  subj_words <- left_join(subj_words, total_words)

# Term frequency: word count / subject total
  subj_words <- subj_words %>% 
    mutate(term_freq = n/total)

 # Number of documents (distinct subjects) in the corpus
  subj_words <- subj_words %>%
    mutate(n_docs = n_distinct(ID))
  
 # Per-word document frequency, its inverse, and log-inverse (idf)
  
   subj_words <- subj_words %>%
    group_by(words) %>%
    mutate(word_doc_freq = n_distinct(ID)/n_docs,
      inv_word_doc_freq = n_docs/n_distinct(ID),
      ln_inv_word_doc_freq =log(n_docs/n_distinct(ID)))
   
   # Drop the literal token "0"
   subj_words <-  subj_words %>% filter(words !="0") 

Explora tf e idf

  # Words appearing in more than 8% of documents (stopwords included)
  subj_words %>%
    group_by(words) %>%
    summarise(word_doc_freq = mean(word_doc_freq)) %>%
    arrange(desc(word_doc_freq)) %>%
    mutate(words = reorder(words, word_doc_freq)) %>%
    filter(word_doc_freq > .08) %>%
    
    ggplot(aes(words, word_doc_freq)) +
      geom_col(aes(fill = word_doc_freq)) +
      xlab(NULL) + coord_flip() +
      scale_fill_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

  # Same plot with stopwords removed
  subj_words %>%
    group_by(words) %>%
    summarise(word_doc_freq = mean(word_doc_freq)) %>%
    anti_join(stopwords) %>%
    arrange(desc(word_doc_freq)) %>%
    mutate(words = reorder(words, word_doc_freq)) %>%
    filter(word_doc_freq > .08) %>%
    
    ggplot(aes(words, word_doc_freq)) +
      geom_col(aes(fill = word_doc_freq)) +
      xlab(NULL) +
      coord_flip() +
      scale_fill_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

   # Term frequency vs. document frequency, labeled by word
   subj_words %>%
     ggplot(aes(
       x = word_doc_freq,
       y = term_freq , 
       color = inv_word_doc_freq
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      geom_text( aes(label = words), 
        check_overlap = TRUE, 
        vjust = 1.5,
        size=3
        ) +
      # colour (not fill) is the mapped aesthetic; scale_fill_gradientn()
      # was silently ignored here
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

    # Same tf vs. df plot, averaged per word, with stopwords removed
    subj_words %>%
      select(2:9) %>%
      anti_join(stopwords) %>%  
      group_by(words) %>%
     summarise_all(.funs = mean) %>% 
      ggplot(aes(
       x = word_doc_freq,
       y = term_freq , 
       color = inv_word_doc_freq
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      geom_text( aes(label = words), 
        check_overlap = TRUE, 
        vjust = 1.5,
        size=3
        ) +
      # colour is the mapped aesthetic; the fill scale had no effect
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

Componentes Term Frequency Inverse Document Frequency (TF_IDF)
  # tf-idf components: document frequency vs. its inverse
  subj_words %>%
     ggplot(aes(
       x = word_doc_freq   ,
       y = inv_word_doc_freq , 
       color = inv_word_doc_freq
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      # colour is the mapped aesthetic; scale_fill_gradientn() was inert
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

# Document frequency vs. log inverse document frequency (idf)
subj_words %>%
     ggplot(aes(
       x =  word_doc_freq    ,
       y = ln_inv_word_doc_freq , 
       color = inv_word_doc_freq
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      # colour is the mapped aesthetic; scale_fill_gradientn() was inert
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        )

Adicionando tf_idf automaticamente

  # tidytext computes tf, idf and tf_idf columns in one call
  subj_words <- subj_words %>%
    bind_tf_idf(term = words, document = ID, n=n)

# Wide document-term matrix of tf-idf weights (cols 1, 2, 12 = ID, words, tf_idf)
dtm <-  subj_words %>% select(c(1, 2, 12)) %>%
  spread(key=words, value = tf_idf)
# tf vs. tf-idf, colored by document frequency, labeled by word
subj_words %>%
     ggplot(aes(
       x =  tf   ,
       y = tf_idf , 
       color = word_doc_freq  
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      # colour (not fill) is the mapped aesthetic; scale_fill_gradientn()
      # was silently ignored
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        ) +
    geom_text( aes(label = words), 
        check_overlap = TRUE, 
        vjust = 1.5,
        size=3
        ) 

 # Helper for quartile grouping (external script)
 source("http://www.labape.com.br/rprimi/R/cria_quartis.R")

# Document-frequency terciles using observed quantile breaks
table(cut(subj_words$word_doc_freq, breaks = c(0, 0.003875969, 0.011627907, 0.399224806)))
## 
##      (0,0.00388] (0.00388,0.0116]   (0.0116,0.399] 
##              712              617             3288
# tf vs. tf-idf per word, faceted by document-frequency group
subj_words %>%
    select(2:12) %>%
     anti_join(stopwords) %>%  
     group_by(words) %>%
     summarise_all(.funs = mean) %>%
     ungroup() %>%
     # NOTE(review): word_doc_freq reaches ~0.399 in the data, so values
     # above 0.20 fall outside these breaks and become NA — confirm intent
     mutate(grp_wf = cut(word_doc_freq, breaks = c(0, 0.003, 0.011, 0.20))) %>%
    ggplot(aes(
       x =  tf   ,
       y = tf_idf , 
       color = word_doc_freq  
       )) +
     geom_point(alpha=1/4) +
     theme_minimal() +
      # colour is the mapped aesthetic; the fill scale had no effect
      scale_color_gradientn(
        colours = brewer.pal(5 ,"Paired")
        ) +
    geom_text( aes(label = words), 
        check_overlap = TRUE, 
        vjust = 1.5,
        size=3
        ) +
  facet_grid(.~grp_wf)

 # Word-subject pairs ranked by tf-idf (most distinctive first)
 subj_words %>%
    arrange(desc(tf_idf))
## # A tibble: 5,027 x 12
## # Groups:   words [1,241]
##        ID words     n total term_freq n_docs word_doc_freq inv_word_doc_fr…
##     <dbl> <chr> <int> <int>     <dbl>  <int>         <dbl>            <dbl>
##  1 1.12e8 brin…     1     1     1        258       0.0271              36.9
##  2 1.10e8 cabe     22    50     0.44     258       0.00388            258  
##  3 1.08e8 préd…     1     2     0.5      258       0.00775            129  
##  4 1.10e8 cari…     1     2     0.5      258       0.00775            129  
##  5 1.11e8 pôr       5    18     0.278    258       0.00388            258  
##  6 1.06e8 casas     1     2     0.5      258       0.0465              21.5
##  7 1.08e8 casas     1     2     0.5      258       0.0465              21.5
##  8 1.09e8 casas     1     2     0.5      258       0.0465              21.5
##  9 1.11e8 metal     2     8     0.25     258       0.00388            258  
## 10 1.07e8 cont…     1     4     0.25     258       0.00388            258  
## # ... with 5,017 more rows, and 4 more variables:
## #   ln_inv_word_doc_freq <dbl>, tf <dbl>, idf <dbl>, tf_idf <dbl>
  # tf vs. tf-idf colored by document frequency
  subj_words %>%
    ggplot(aes(x=tf , y= tf_idf, color = word_doc_freq)) + 
    geom_point(alpha=1/4) +
    scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
    # the first argument of scale_y_continuous() is `name`; the break
    # positions must be passed as `breaks =` to take effect
    scale_y_continuous(breaks = seq(0, 2.5, .5), limits=c(0, 2.5))

  # Top-10 tf-idf words per subject (top_n with no wt uses the last
  # ordering column, tf_idf)
  temp <- subj_words %>%
    arrange(ID, desc(tf_idf)) %>% 
     group_by(ID) %>% 
     top_n(10) %>% 
    mutate(word = forcats::fct_reorder(as.factor(words), tf_idf)) 
   
 
    
  # Bar charts for the first few subjects only (first 60 rows)
  temp[1:60, ] %>% 
  ggplot(aes(word, tf_idf, fill = ID)) +
    geom_col(show.legend = FALSE) +
    labs(x = NULL, y = "tf-idf") +
    facet_wrap(~ID, ncol = 2, scales = "free") +
    coord_flip()

Latent Semantic Analysis via Topic modeling (Latent Dirichlet Allocation - LDA)
  # install.packages('servr') 
  library(stringr)
  library(text2vec)
 
  
  # Lower-case preprocessing and word tokenization for text2vec
  prep_fun <- tolower
  tok_fun <- word_tokenizer

  # Token iterator over responses; documents identified by subject ID
  it = itoken(
      dt_cx$resposta, 
      ids = dt_cx$ID, 
      preprocessor = prep_fun, 
      tokenizer = tok_fun, 
      progressbar = FALSE)
  
  # Vocabulary without stopwords; drop terms present in >80% of documents
  vocab = 
    create_vocabulary(it,
      stopwords = stopwords$words) %>%
    
    prune_vocabulary(
      term_count_min = 1, 
      doc_proportion_max = 0.8
      )

  # NOTE(review): this `dtm` overwrites the tf-idf wide table built earlier
  vectorizer = vocab_vectorizer(vocab)
  dtm = create_dtm(it, vectorizer)
  
  dim(dtm)
## [1] 3082 1178
  # define tfidf model
  tfidf = TfIdf$new()
  # fit model to train data and transform train data with fitted model
  dtm_tfidf = fit_transform(dtm, tfidf)

 # 12-topic LDA with sparse symmetric priors
 lda_model = LDA$new(
   n_topics = 12, 
   doc_topic_prior = 0.1,
   topic_word_prior = 0.01
   )

  # Fit LDA; returns the document-topic distribution matrix
  doc_topic_distr = 
    lda_model$fit_transform(
    x = dtm_tfidf, 
    n_iter = 1000, 
    convergence_tol = 0.001, 
    n_check_convergence = 25, 
    progressbar = FALSE
    )
## INFO [2018-11-25 12:58:48] iter 25 loglikelihood = -56647.167
## INFO [2018-11-25 12:58:48] iter 50 loglikelihood = -54650.540
## INFO [2018-11-25 12:58:49] iter 75 loglikelihood = -54131.180
## INFO [2018-11-25 12:58:49] iter 100 loglikelihood = -53923.703
## INFO [2018-11-25 12:58:49] iter 125 loglikelihood = -53826.124
## INFO [2018-11-25 12:58:49] iter 150 loglikelihood = -53789.327
## INFO [2018-11-25 12:58:49] early stopping at 150 iteration
  # Top 12 words per topic (low lambda favors topic-exclusive words)
  lda_model$get_top_words(n = 12, 
    topic_number = c(1:12), lambda = 0.2)
##       [,1]           [,2]        [,3]         [,4]         [,5]     
##  [1,] "mesa"         "desenhar"  "maquete"    "lixo"       "armário"
##  [2,] "celular"      "geladeira" "tapete"     "roupa"      "boneco" 
##  [3,] "lousa"        "mochila"   "calendário" "guarda"     "letras" 
##  [4,] "escudo"       "chinelo"   "arma"       "reciclar"   "janela" 
##  [5,] "casas"        "aviao"     "cartaz"     "lápis"      "barco"  
##  [6,] "mentira"      "carregar"  "livro"      "armario"    "relogio"
##  [7,] "sapatos"      "cesta"     "desenho"    "computador" "bonecos"
##  [8,] "bola"         "fogão"     "garrafa"    "gol"        "anel"   
##  [9,] "recortar"     "fogao"     "óculos"     "máscara"    "cabe"   
## [10,] "escorregador" "mudança"   "estante"    "vaso"       "martelo"
## [11,] "ventilador"   "caixinha"  "banquinho"  "fotos"      "vender" 
## [12,] "construir"    "jogo"      "tabuleiro"  "mudanças"   "tijolo" 
##       [,6]        [,7]       [,8]         [,9]          [,10]       
##  [1,] "casa"      "avião"    "brinquedos" "carro"       "casinha"   
##  [2,] "cadeira"   "relógio"  "caderno"    "castelo"     "sapato"    
##  [3,] "boneca"    "robô"     "estojo"     "bolsa"       "roupas"    
##  [4,] "cachorro"  "árvore"   "cama"       "capacete"    "escorregar"
##  [5,] "caminhão"  "enfeites" "faca"       "televisão"   "livros"    
##  [6,] "flor"      "casinhas" "pintura"    "esconder"    "grama"     
##  [7,] "maquiagem" "forte"    "mesinha"    "bonecas"     "gato"      
##  [8,] "cartas"    "colar"    "brinco"     "quadro"      "prateleira"
##  [9,] "brincar"   "gaiola"   "quadros"    "banco"       "barbie"    
## [10,] "pratos"    "compras"  "alfabeto"   "parede"      "pasta"     
## [11,] "escola"    "natal"    "colher"     "esconderijo" "placas"    
## [12,] "cofre"     "apoiador" "robo"       "sofá"        "jogar"     
##       [,11]      [,12]       
##  [1,] "espada"   "carrinho"  
##  [2,] "armadura" "foguete"   
##  [3,] "tv"       "enfeite"   
##  [4,] "arminha"  "reciclagem"
##  [5,] "prédio"   "papel"     
##  [6,] "lapis"    "caneta"    
##  [7,] "carteira" "pote"      
##  [8,] "cartazes" "cozinha"   
##  [9,] "escada"   "lixinho"   
## [10,] "nave"     "desenhos"  
## [11,] "lousinha" "carrinhos" 
## [12,] "objetos"  "dinheiro"
  # Interactive LDAvis visualization of the fitted topics
  lda_model$plot()
LDA usando trigramas (ngram = c(3, 3))
  library(stringr)
  library(text2vec)
  
  # Same preprocessing as the unigram model above
  prep_fun = tolower
  tok_fun = word_tokenizer

  it = itoken(dt_cx$resposta, 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = dt_cx$ID, 
             progressbar = FALSE)
  
  # Trigram vocabulary (ngram = c(3, 3) builds 3-grams only), no stopwords
  vocab = create_vocabulary(
    it, 
    stopwords = stopwords$words,
    ngram = c(3, 3)
    ) %>%
    prune_vocabulary(
      term_count_min = 1, 
      doc_proportion_max = 0.8)
  
  dim(vocab)
## [1] 507   3
  vectorizer = vocab_vectorizer(vocab)
  dtm = create_dtm(it, vectorizer)
  
  dim(dtm)
## [1] 3082  507
  # define tfidf model
  tfidf = TfIdf$new()
  # fit model to train data and transform train data with fitted model
  dtm_tfidf = fit_transform(dtm, tfidf)

  dim(dtm_tfidf )
## [1] 3082  507
 # 8-topic LDA on the trigram document-term matrix
 lda_model = LDA$new(
   n_topics =8, 
   doc_topic_prior = 0.1,
   topic_word_prior = 0.01
   )

  doc_topic_distr = 
    lda_model$fit_transform(
    x = dtm_tfidf, n_iter = 1000, 
    convergence_tol = 0.001, 
    n_check_convergence = 25, 
    progressbar = FALSE
    )
## INFO [2018-11-25 12:58:49] iter 25 loglikelihood = -10279.961
## INFO [2018-11-25 12:58:49] iter 50 loglikelihood = -10282.998
## INFO [2018-11-25 12:58:49] early stopping at 50 iteration
  # Topic proportions for document 2
  barplot(
    doc_topic_distr[2, ], xlab = "topic", 
    ylab = "proportion", ylim = c(0, 1), 
    names.arg = 1:ncol(doc_topic_distr)
    )

  # Top trigrams for the first four topics
  lda_model$get_top_words(n = 12, 
    topic_number = c(1:4), lambda = 0.2)
##       [,1]                         [,2]                           
##  [1,] "dar_moradores_rua"          "caixa_guardar_coisas"         
##  [2,] "brincar_nave_espacial"      "fazer_carrinho_papelão"       
##  [3,] "fazer_porta_celular"        "carrinho_roleman_papelao"     
##  [4,] "fazer_casinha_polly"        "caixa_presente_papelão"       
##  [5,] "capa_caderno_papelão"       "fazer_casinha_papelão"        
##  [6,] "reciclar_todos_tamanhos"    "fazer_tv_brincar"             
##  [7,] "colocar_coisa_velha"        "orelha_coelho_páscoa"         
##  [8,] "casa_caixa_papelão"         "fazer_carro_brincar"          
##  [9,] "b_onecas_papelão"           "fazer_brinquedos_infantis"    
## [10,] "guardar_pessoa_carro"       "usar_manobras_skate"          
## [11,] "melhor_fazer_brinquedos"    "fazer_ferramentas_brincadeira"
## [12,] "armadura_brincar_guerreiro" "caixa_porta_sapatos"          
##       [,3]                       [,4]                       
##  [1,] "caixa_guardar_brinquedos" "guardar_brinquedos_coisas"
##  [2,] "guardar_coisas_antigas"   "fazer_casinha_barbie"     
##  [3,] "fazer_casinha_gato"       "guardar_filmes_dvd's"     
##  [4,] "forrar_chão_pintar"       "mesinha_colo_papelao"     
##  [5,] "utilizar_sapatos_dentro"  "voce_mudar_casa"          
##  [6,] "montar_tv_brinquedo"      "pode_escorregar_papelão"  
##  [7,] "guardar_arvore_natal"     "caixa_anel_casamento"     
##  [8,] "caixa_guardar_roupa"      "capa_caderno_papelao"     
##  [9,] "fazer_trabalho_escola"    "fazer_cabelo_bonecos"     
## [10,] "cartazes_anunciando_algo" "colocar_coisas_mudança"   
## [11,] "fazer_enfeites_festas"    "caixa_guardar_garrafa"    
## [12,] "caminha_cachorro_dormir"  "caixa_doação_roupas"
  # Interactive LDAvis visualization of the trigram topics
  lda_model$plot()

Classificação de idéias usando word embeddings

    library(readr)
  
 # Read pre-trained Portuguese GloVe word embeddings (300d)
  nilc_wv <- read_delim(
    file = "glove_s300.txt", 
    delim = " ",quote="", 
    skip = 1,  
    col_names = FALSE, 
    progress = TRUE)
  
  # First column = word; remaining 300 columns = embedding dims V1..V300
  names(nilc_wv)[1]<-"words"
  names(nilc_wv)[2:301]<-paste("V", 1:300, sep= "")
  
 
  
  # Read 600d word embeddings (alternative, disabled)
  # load("~/Documents/word_vectors/nilc_wv_glove_600d.RData")
  
  
  # Build vocabulary with corpus counts
  vocab <- dt_cx2 %>% count(words, sort = TRUE)
  
  # Attach embedding vectors to vocabulary words
  vocab <- left_join(vocab, nilc_wv)
  
  # Logical vector: TRUE for words found in the embedding lexicon
  # NOTE(review): the name `select` shadows dplyr::select within this scope
  select <- vocab %>% select(V1) %>% is.na %>% as.logical()
  select <- !select
  
  # Hierarchical clustering of word vectors into 200 clusters
  # (the original comment said 100; k = 200 below)
  d   <- dist(vocab[select , 3:302], method="euclidean") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp200 <- cutree(cluster, k = 200)
  
  # Attach cluster membership to the matched vocabulary rows
  vocab <- vocab %>% filter(select) %>% select(1:2) %>% bind_cols(as.data.frame(grp200))
  
  # Carry word clusters back to the tokenized data
  dt_cx2 <- dt_cx2[, 1:7]
  dt_cx2 <- dt_cx2 %>% left_join(vocab)
  
  # One dummy column per cluster (psych::dummy.code)
  dt_cx2 <- bind_cols(dt_cx2, as.data.frame(dummy.code(dt_cx2$grp200)))
  
  names(dt_cx2)
  
  # Aggregate cluster dummies to one row per response
  dt_cx3 <-  dt_cx2 %>% select(c(1, 2, 4, 10:209)) %>%
    group_by(ID, resp_num, pre_pos) %>%
    summarise_all(.funs = sum, na.rm=TRUE)
  
   names(dt_cx3)
   # Cluster responses into 60 groups via binary distance on the dummies
   # (the original comment said 40; k = 60 below)
  d   <- dist( dt_cx3[, 4:203], method="binary") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp60 <- cutree(cluster, k = 60)
   
  dt_cx3 <- bind_cols(dt_cx3, as.data.frame(grp60))
  names(dt_cx3) 
  # Join response clusters back to dt_cx (column 204 = grp60)
  dt_cx <- left_join(dt_cx[, 1:5], dt_cx3[, c(1:3, 204)])

Visualizando clusters usando t-SNE

  library(Rtsne)
  library(ggrepel)
  library(ggthemes)
  library(RColorBrewer)
  library(artyfarty)
 
 # Re-read the 300d word embeddings (same file as above)
  nilc_wv <- read_delim(
    file = "glove_s300.txt", 
    delim = " ",quote="", 
    skip = 1,  
    col_names = FALSE, 
    progress = TRUE)
  
  names(nilc_wv)[1]<-"words"
  names(nilc_wv)[2:301]<-paste("V", 1:300, sep= "")

  # Keep first three vocab columns and re-attach embedding vectors
  vocab <- vocab[ , 1:3]
  vocab <- left_join(vocab, nilc_wv)
  select <- vocab %>% select(V1) %>% is.na %>% as.logical()
  select <- !select 
  
  
  names(vocab)
 
 # 2-D t-SNE projection of the 300d word vectors
 tsne_out <- Rtsne(vocab[select , 4:303], perplexity = 18) 
 
 # Replace embedding columns with the two t-SNE coordinates (V1, V2)
 vocab  <-  cbind(vocab[select, 1:3] , as.data.frame(tsne_out$Y))
names(vocab)

  # NOTE(review): at this point vocab's third column is grp200, not grp60 —
  # confirm the intended color variable before running.
  ggplot(data = vocab,  
            mapping = aes(
             y = V1,
             x = V2,
            color = grp60) 
           ) +
          
         geom_point()  +
                  
         geom_text_repel(
            aes(label=words), 
            size=2, vjust=-1.2
            ) +
       theme_minimal() +
      scale_color_gradientn(colours =  brewer.pal(12, "Paired"))
  
  
  # Cluster words in the 2-D t-SNE space.
  # ("euclidian" is not a valid dist() method name and raised an error)
  d   <- dist( vocab[, 4:5], method="euclidean") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp60 <- cutree(cluster, k = 60)
  
  # Attach cluster membership to the vocabulary
  # NOTE(review): `select` was computed for the pre-t-SNE vocab; verify its
  # length still matches vocab here before relying on this filter.
  vocab <- vocab %>% filter(select) %>% select(1:2) %>% bind_cols(as.data.frame(grp60))
  
  names(dt_cx2)
  # Carry t-SNE-based clusters back to the tokenized data
  dt_cx2b <- dt_cx2[, c(1:7)] %>% left_join(vocab)
  
  # One dummy column per cluster
  dt_cx2b <- bind_cols(dt_cx2b, as.data.frame(dummy.code(dt_cx2b$grp60)))
  
  names(dt_cx2b)
  
  # Aggregate dummies to one row per response
  dt_cx3b <-  dt_cx2b %>% select(c(1, 2, 4, 10:49)) %>%
    group_by(ID, resp_num, pre_pos) %>%
    summarise_all(.funs = sum, na.rm=TRUE)
  
   names(dt_cx3b)
   # Cluster the responses via binary distance on the 40 dummy columns
   # (the original used dt_cx3 — the earlier 200-cluster base — by mistake)
  d   <- dist( dt_cx3b[, 4:43], method="binary") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp60b <- cutree(cluster, k = 60)
  
  names(dt_cx3b) 
  # grp60b is the object created above (grp40b was never defined)
  dt_cx3b <- bind_cols(dt_cx3b, as.data.frame(grp60b))
   
  # Column 44 = grp60b (3 id columns + 40 dummies + the new cluster column)
  dt_cx <- left_join(dt_cx[, 1:6], dt_cx3b[, c(1:3, 44)])
  
  # Cross-tab of embedding-space vs. t-SNE-space response clusters
  # (grp40/grp40b were never created; the joined columns are grp60/grp60b)
  table(dt_cx$grp60, dt_cx$grp60b)
  
  table(dt_cx$grp60)
Miscellaneous
  # LSA document embeddings and cluster-count diagnostics
  prep_fun = tolower
  tok_fun = word_tokenizer

  it = itoken(dt_cx$resposta, 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = dt_cx$ID, 
             progressbar = FALSE)
  
  # (a trailing comma after `stopwords = ...` produced an empty-argument
  # error in the original call)
  vocab = create_vocabulary(
    it, 
    stopwords = stopwords$words
    ) %>%
    prune_vocabulary(doc_proportion_max = 0.60, term_count_min = 2)

  vectorizer = vocab_vectorizer(vocab)
  dtm = create_dtm(it, vectorizer)
  
  dim(dtm)

  # define tfidf model
  tfidf = TfIdf$new()
  # fit model to train data and transform train data with fitted model
  dtm_tfidf = fit_transform(dtm, tfidf)

  # 20-dimensional latent semantic space
  lsa = LSA$new(n_topics = 20)
  
  doc_embeddings = dtm %>% 
    fit_transform(tfidf) %>% 
    fit_transform(lsa)

  dim(doc_embeddings)
  dim(lsa$components)
  
 # Cluster-count diagnostics
 library(NbClust)
 library(factoextra)
  
  # Drop duplicated embedding rows before the elbow (wss) plot
  s <- !duplicated(doc_embeddings)
  
  as.data.frame(doc_embeddings) %>% 
    filter(s) %>%
    fviz_nbclust(x = ., 
     FUNcluster = hcut, 
     method = c("wss"), k.max = 60)
 
    # NOTE(review): dt_cx$grp32 is not created anywhere in this script —
    # presumably left over from an earlier session; verify before running.
    table(dt_cx$grp32)
    
    s <- !duplicated(doc_embeddings) & dt_cx$grp32==5
    
    as.data.frame(doc_embeddings) %>% 
    filter(s) %>%
    fviz_nbclust(x = ., 
     FUNcluster = hcut, 
     method = c("wss"), k.max = 40)
    
  
    # Sub-cluster the responses in category 5 into 10 groups
    s <- dt_cx$grp32==5
    d   <-  as.data.frame(doc_embeddings) %>% filter(s) %>% dist(method="euclidean") 
    cluster  <- hclust(d, method="ward.D2")
    plot(cluster)
    
    grp10 <- cutree(cluster, k = 10)
    
     dt_cx_cat5 <-  dt_cx %>% filter(grp32==5) %>% cbind(., grp10)
      
      table(dt_cx_cat5$grp10)