Compare the bag of words of two documents and find the matching words and their frequency in the second document
I computed a bag of words for 'yelp.csv', 'yelpp.csv' and 'yelpn.csv' and built a word-frequency matrix for each individual dataset. Now I want to compare the yelp bag of words with yelpn, check how many words from yelp appear in yelpn and what their frequencies are, and store the result in a variable as a matrix; then do the same for yelpp. yelp contains both positive and negative reviews, yelpp only positive ones and yelpn only negative ones. Can anyone complete the code? I don't know whether this code is even appropriate, but I hope so. A toy sketch of the kind of result I am after is shown right below.
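For illustration only (made-up word vectors, not my real data), this is the result I want: for every word of the full yelp bag that also occurs in the negative bag, report how often it occurs there.

bag_full <- c("food", "good", "bad", "service", "good")
bag_neg  <- c("bad", "service", "bad", "slow")
common      <- intersect(unique(bag_full), bag_neg)   # words of yelp that appear in yelpn
freq_in_neg <- table(bag_neg)[common]                 # their frequency in yelpn
result <- cbind(term = common, freq = as.integer(freq_in_neg))
result
#      term      freq
# [1,] "bad"     "2"
# [2,] "service" "1"

My code so far: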
getwd()
setwd("/Users/ash/RProjects/exc")
getwd()
library(tm)    # removeNumbers(), removePunctuation() and the bundled stopword list
library(qdap)  # bag_o_words() and freq_terms()
df <- read.csv("yelp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat", package = "tm"))
s <- remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t
# positive reviews
dfp <- read.csv("yelpp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat", package = "tm"))
sp <- remove_stopwords(dfdp, wordsp, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp
# negative reviews
dfn <- read.csv("yelpn.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat", package = "tm"))
sn <- remove_stopwords(dfdn, wordsn, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn
#bag
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat
#frequent terms
frequent_terms <- freq_terms(t, 2000)   # use the cleaned text, consistent with the pos/neg calls below
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn
1 Answer
For the example corpora I take some text from the Text mining Wikipedia article. The tm package together with the findFreqTerms and agrep functions are the main building blocks of this approach.
agrep
Searches for approximate matches to a pattern (the first argument) within each element of the string x (the second argument), using the generalized Levenshtein edit distance (the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another).
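A quick illustration with toy vectors (not taken from the corpora below): agrep() returns the indices of the elements of x that approximately contain the pattern.

agrep("pattern", c("pattern", "patterns", "text"))
# [1] 1 2
agrep("data", c("data", "database", "derived"))
# [1] 1 2

This is also why "data" ends up with freq = 2 in the output further down: it matches both "data" and "database" in ft2.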
Steps of the approach:
texts -> corpora -> data cleaning -> findFreqTerms -> compare against the other document-term matrix
library(tm)
c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))
c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))
c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))
# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))
c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)
c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)
c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)
dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))
ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)
#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for (t in ft1) {
  find <- agrep(t, ft2)
  if (length(find) != 0) {
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: this for loop can be replaced by apply-family functions if it gets slow on large texts
common.c1c2
It contains the words shared between corpus1 and corpus2, together with their frequency (the number of approximate matches in ft2):
> common.c1c2
      term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1
> ft1
[1] "also" "analytics" "data" "derived" "deriving" "devising" "equivalent"
[8] "highquality" "information" "learning" "means" "mining" "pattern" "patterns"
[15] "process" "referred" "roughly" "statistical" "text" "trends" "typically"
> ft2
[1] "addition" "along" "data" "database" "derived" "deriving"
[7] "evaluation" "features" "finally" "input" "insertion" "interpretation"
[13] "involves" "linguistic" "mining" "others" "output" "parsing"
[19] "patterns" "process" "removal" "structured" "structuring" "subsequent"
[25] "text" "usually" "within"
This solution is not the most efficient one, but I hope it helps.
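Applied to your files, a minimal sketch along the same lines could look like the following. It assumes t, tp and tn are the cleaned character vectors from your code; compare_bags is just a hypothetical helper name, and the frequencies are counts of approximate matches, exactly as above.

library(tm)

# Hypothetical helper: for every term of the full set, count how many terms of the
# positive/negative set it approximately matches, and return the result as a matrix.
compare_bags <- function(full_text, subset_text) {
  dtm_full   <- DocumentTermMatrix(Corpus(VectorSource(full_text)))
  dtm_subset <- DocumentTermMatrix(Corpus(VectorSource(subset_text)))
  ft_full    <- findFreqTerms(dtm_full)
  ft_subset  <- findFreqTerms(dtm_subset)
  rows <- lapply(ft_full, function(term) {
    hits <- agrep(term, ft_subset)
    if (length(hits) > 0) c(term = term, freq = length(hits)) else NULL
  })
  do.call(rbind, rows)   # character matrix with columns "term" and "freq"; NULL rows are dropped
}

common.yelp.neg <- compare_bags(t, tn)   # words of yelp that also occur in yelpn, with frequency
common.yelp.pos <- compare_bags(t, tp)   # same for yelpp

rbind() keeps term and freq together, so the result is a character matrix; wrap it in data.frame(..., stringsAsFactors = FALSE) and convert the freq column with as.integer() if you need numeric frequencies.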