Count the frequency of whole text phrases instead of individual words

102 Views Asked by At

I have a data frame with only one column "text"

"text"
"User Interfaces"
"Twitter"
"Text Normalization"
"Term weighting"
"Teenagers"
"Team member replacement"

I would like to get a data frame with the frequency of every phrase, like this:

 "User Interfaces",1
 "Twitter",1
 "Text Normalization",1
 "Term weighting",1
 "Teenagers",1
 "Team member replacement",1

To do this, I use the following code:

# Term-frequency pipeline built on the tm package: read phrases from a CSV,
# clean them, build a term-document matrix and keep the 30 most frequent terms.
library(tm) 
# Read the CSV; the first row is used as the header (header=TRUE).
df <- read.csv("C:/Users/acel/Desktop/myphr.csv", header=TRUE, sep=",")
# Wrap the "text" column in a tm corpus.
corpusD <- Corpus(VectorSource(df$text))
# Cleaning steps, applied in this order: lowercase, remove English stop words,
# remove numbers, collapse whitespace, convert to PlainTextDocument, stem.
corpusD <- tm_map(corpusD, tolower)
corpusD <- tm_map(corpusD, removeWords, stopwords('english'))
corpusD <- tm_map(corpusD, removeNumbers)
corpusD <- tm_map(corpusD, stripWhitespace)
corpusD <- tm_map(corpusD, PlainTextDocument)
corpusD <- tm_map(corpusD, stemDocument, language = "english")
# Re-wrap the cleaned corpus in a new corpus before building the matrix.
corpusC <- Corpus(VectorSource(corpusD))
# NOTE(review): this is presumably where each phrase is split into single-word
# terms (the reported result shows user,1 and interface,1 rather than the whole
# phrase) -- confirm against the tm documentation for TermDocumentMatrix.
matrixD <- TermDocumentMatrix(corpusC)
# Drop sparse terms, using 0.75 as the sparsity threshold.
matrixD <- removeSparseTerms(matrixD, 0.75)
# Sum each row of the matrix to get one total count per term.
MatrixDfreq <- rowSums(as.matrix(matrixD))
# Order the counts from most to least frequent.
MatrixDfreq<-sort(MatrixDfreq, decreasing = TRUE)
# Keep the first 30 entries. NOTE(review): with fewer than 30 terms, indexing
# 1:30 pads the result with NA entries.
MatrixDtop30<- MatrixDfreq [1:30]

But when I check the result in MatrixDtop30, I see individual words counted, like user,1 and interface,1, instead of the whole phrase "user interface",1.

Any idea why this is happening?

2

There are 2 best solutions below

4
On BEST ANSWER

I think this would be a lot easier using data.table operations.

library(data.table)
# Toy input: a single "text" column in which some values repeat.
df <- data.frame(
  text = c("test", "test", "test", "test2", "test3", "test2")
)

> df
   text
1  test
2  test
3  test
4 test2
5 test3
6 test2

# Turn df into a data.table, then count the rows for each distinct value of
# "text"; the count is stored in a column called Number.
setDT(df)
df <- df[, list(Number = .N), by = "text"]

> df
    text Number
1:  test      3
2: test2      2
3: test3      1

Edit

We can include stemming with this

library(data.table)
library(SnowballC)
# Toy input for the stemming example: "testing" appears alongside "test".
df <- data.frame(
  text = c("test", "testing", "test", "test2", "test3", "test2")
)

> df
     text
1    test
2 testing
3    test
4   test2
5   test3
6   test2

# Replace each entry of the "text" column with its stem (language "porter"),
# so that e.g. "testing" is counted together with "test".
df[["text"]] <- wordStem(df[["text"]], language = "porter")

> df
   text
1  test
2  test
3  test
4 test2
5 test3
6 test2

# After stemming: turn df into a data.table and count the rows per distinct
# stem, storing the count in a column called Number.
setDT(df)
df <- df[, list(Number = .N), by = "text"]

> df
    text Number
1:  test      3
2: test2      2
3: test3      1
0
On

In your example output, it doesn't look like you're performing any transformations on the text (such as lowercasing or removing stop words); you're just keeping the phrases as they are. If so, you can easily count the occurrences of each unique phrase using the tidyverse.

library(dplyr)
library(readr)

# Build a one-column tibble holding the example phrases. tibble() is used in
# place of the deprecated data_frame(), and the call is now closed properly
# (the original line was missing its final parenthesis and did not parse).
df <- tibble(
  text = c(
    "User Interfaces", "Twitter", "Text Normalization",
    "Term weighting", "Teenagers", "Team member replacement"
  )
)
# Count how often each distinct phrase occurs; the count is returned in a
# column named n.
count(df, text)
                     text     n
                    <chr> <int>
1 Team member replacement     1
2               Teenagers     1
3          Term weighting     1
4                    text     1
5      Text Normalization     1
6                 Twitter     1
7         User Interfaces     1

or

# Read the phrases from the CSV file, then list each distinct value of the
# "text" column with its count, sorted by count (sort = TRUE).
text_df <- read_csv(file = "C:/Users/acel/Desktop/myphr.csv")
text_df %>%
  count(text, sort = TRUE)

If you need to perform transformations on the text, look at the stringr and tidytext packages.