# Source: construct_frequency_table.R (99 lines, 81 loc, 2.45 KB),
# forked from bquast/Data-Science-Capstone.
# construct_frequency_table.R
# Bastiaan Quast
# load the libraries
library(tm)
library(RWeka)
library(dplyr)
library(magrittr)
# load the sample data
# The rest of this script reads `sample_blogs`, `sample_news` and
# `sample_twitter` without defining them -- presumably they are restored
# from this file; confirm against whatever writes sample_data.RData.
load(file = "sample_data.RData")
# n-gram tokenisers
#
# make_ngram_token() returns a tokeniser that emits n-grams of exactly
# `size` words (min and max are both set to `size`).
#
# Each tokeniser captures its own `size`. Reading a shared global `n` at
# call time instead would make every tokeniser use whatever value `n`
# holds last, i.e. bigram_token would silently produce trigrams.
make_ngram_token <- function(size) {
  # force the promise now so the closure owns a fixed value
  force(size)
  function(x) NGramTokenizer(x, Weka_control(min = size, max = size))
}

bigram_token <- make_ngram_token(2L)
trigram_token <- make_ngram_token(3L)

# Retained for backward compatibility only: earlier versions of this
# script left a global `n` set to 3L. The tokenisers no longer read it.
n <- 3L
# check length function
#
# length_is(n) returns a predicate that is TRUE when its argument has
# exactly `n` elements, e.g. Filter(length_is(3), list_of_vectors).
#
# `n` is forced when the predicate is created; otherwise it would only
# be evaluated on the first call and would pick up any change made to
# the caller's variable in between.
length_is <- function(n) {
  force(n)
  function(x) length(x) == n
}
# construct a single corpus from the sample data

# Turn one text sample into a VCorpus and strip extra whitespace from
# every document. The same steps are applied to each of the three
# samples, so they live in one helper instead of three copies.
#
# NOTE(review): recent tm releases appear to require `doc_id` and `text`
# columns for DataframeSource() -- confirm against the installed tm
# version before relying on the plain data.frame() conversion here.
sample_to_corpus <- function(sample_text) {
  sample_text %>%
    data.frame() %>%
    DataframeSource() %>%
    VCorpus() %>%
    tm_map(stripWhitespace)
}

vc_blogs <- sample_to_corpus(sample_blogs)
vc_news <- sample_to_corpus(sample_news)
vc_twitter <- sample_to_corpus(sample_twitter)

# combine the three corpora: blogs first, then news, then twitter
vc_all <- c(vc_blogs, vc_news, vc_twitter)
# unigram frequencies
# Punctuation and numbers are removed; the lower word-length bound of 1
# keeps one-character terms in the matrix.
tdm_unigram <- TermDocumentMatrix(
  vc_all,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    wordLengths = c(1, Inf)
  )
)

# total count of every term, summed over all documents
freq_unigram <- rowSums(as.matrix(tdm_unigram))

# collect every unigram once, so that the factor columns built from the
# trigrams can all share the same set of levels
unigram_levels <- unique(tdm_unigram$dimnames$Terms)
# trigram term-document matrix
# Same cleaning options as the unigram matrix, but terms are produced by
# trigram_token instead of the default tokeniser.
tdm_trigram <- TermDocumentMatrix(
  vc_all,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    wordLengths = c(1, Inf),
    tokenize = trigram_token
  )
)
# aggregate frequencies: total count of each trigram term over all documents
# NOTE(review): as.matrix() materialises the full terms x documents
# matrix; for large samples a sparse row sum may be needed -- confirm.
trigram_counts <- rowSums(as.matrix(tdm_trigram))

# split every distinct term into its words once (rather than once per
# occurrence) and remember which terms really consist of three words
trigram_words <- strsplit(as.character(names(trigram_counts)), split = " ")
is_trigram <- vapply(trigram_words, length, integer(1)) == 3L
# NOTE(review): a term such as "a  b" (double space) splits into
# c("a", "", "b") and passes this check; its empty word becomes NA in
# the factors below unless "" is a unigram level -- confirm whether the
# term-document matrix can contain such terms.

# one row per distinct trigram, one column per word; fall back to an
# empty three-column matrix so the column indexing below still works
# when no term qualifies
if (any(is_trigram)) {
  unique_trigrams <- do.call(rbind, trigram_words[is_trigram])
} else {
  unique_trigrams <- matrix(character(0), nrow = 0L, ncol = 3L)
}

# repeat by frequency: each trigram appears once per observed occurrence,
# in the same order as the terms of the term-document matrix
row_index <- rep(
  seq_len(nrow(unique_trigrams)),
  times = trigram_counts[is_trigram]
)
freq_trigram <- unique_trigrams[row_index, , drop = FALSE]

# transform to a data.frame, encoding all three words as factors over the
# shared unigram levels (words missing from unigram_levels become NA)
df_trigram <- data.frame(
  X1 = factor(freq_trigram[, 1], levels = unigram_levels),
  X2 = factor(freq_trigram[, 2], levels = unigram_levels),
  Y = factor(freq_trigram[, 3], levels = unigram_levels)
)
# save the trigram data frame together with the shared factor levels
save(list = c("df_trigram", "unigram_levels"), file = "df_trigram.RData")