-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze_topics.R
164 lines (144 loc) · 5.66 KB
/
analyze_topics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
library(tm)
library(topicmodels)
lda.trump <- readRDS("t10.rds")
lda.clinton <- readRDS("c10.rds")
clinton.topics <- as.matrix(topics(lda.clinton))
clinton.terms <- as.matrix(terms(lda.clinton,20))
trump.topics <- as.matrix(topics(lda.trump))
trump.terms <- as.matrix(terms(lda.trump,20))
int.hist = function(x,ylab="Frequency",...) {
barplot(table(factor(x,levels=min(x):max(x))),space=0,xaxt="n",ylab=ylab,...);axis(1)
}
int.hist(clinton.topics)
int.hist(trump.topics)
## Clinton topic evolutions
library(tm)
library(topicmodels)
fb_data <- readRDS('fb_data.rds')
months <- (unlist(fb_data[[4]]))
dtm <- readRDS("dtm.rds")
idx<-unlist(lapply(which(grepl('clinton', dtm$dimnames$Terms)), function(x) which(dtm$j %in% x)))
cidx <- sort(unique(dtm$i[idx]))
months <- months[cidx]
posts<-fb_data[[1]]
p_id <- array(NA, length(unlist(posts)))
p_id[1:length(posts[[1]])] <- 1
last_id <- length(posts[[1]])
for (i in 2:10) {
p_id[(last_id+1):(last_id+length(posts[[i]]))] <- i;
last_id <- last_id+length(posts[[i]]);
}
p_id <- p_id[cidx]
c_topics <- array(NA, c(11, 10))
for (m in 1:11)
for (t in 1:10)
c_topics[m,t] <- sum(clinton.topics[months==m]==t)/length(clinton.topics[months==m]);
library(ggplot2)
library(reshape2)
library(scales)
library(ggthemes)
ct <- data.frame(t(c_topics))
rownames(ct) <- sprintf("Topic %2d", 1:10)
colnames(ct) <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul","Aug","Sep","Oct","Nov")
datm <- melt(cbind(ct, Topics = rownames(ct)), id.vars = c('Topics'))
png ("c_topic_evol.png", height=5, width=7, units = 'in', res = 200)
ggplot(datm,aes(x = variable, y = value, fill=Topics)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual("Clinton",
values = c("forestgreen", "orange", "darkblue","red",
"gold4", "mediumpurple4","cyan4", "darkorange3",
"blue","firebrick4")) +
labs(x="",y="Coverage percentage") +
theme_set(theme_gray(base_size = 12))
dev.off()
c_topics <- array(NA, c(10, 10))
for (m in 1:10)
for (t in 1:10)
c_topics[m,t] <- sum(clinton.topics[p_id==m]==t)/length(clinton.topics[p_id==m]);
library(ggplot2)
library(reshape2)
library(scales)
library(ggthemes)
ct <- data.frame(t(c_topics))
rownames(ct) <- sprintf("Topic %2d", 1:10)
colnames(ct) <- c("ABC", "BBC","CBS", "CNN", "Fox", "NBC",
"NPR", "NYT", "WP", "WSJ")
datm <- melt(cbind(ct, Topics = rownames(ct)), id.vars = c('Topics'))
png ("c_topic_news.png", height=5, width=7, units = 'in', res = 200)
ggplot(datm,aes(x = variable, y = value, fill=Topics)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual("Clinton",
values = c("forestgreen", "orange", "darkblue","red",
"gold4", "mediumpurple4","cyan4", "darkorange3",
"blue","firebrick4")) +
labs(x="",y="Coverage percentage") +
theme_set(theme_gray(base_size = 12))
dev.off()
## Trump topic evolutions
library(tm)
library(topicmodels)
fb_data <- readRDS('fb_data.rds')
months <- (unlist(fb_data[[4]]))
dtm <- readRDS("dtm.rds")
idx<-unlist(lapply(which(grepl('trump', dtm$dimnames$Terms)), function(x) which(dtm$j %in% x)))
tidx <- sort(unique(dtm$i[idx]))
months <- months[tidx]
dtm.trump <- dtm[ sort(unique(dtm$i[idx])),]
dtm.trump <- removeSparseTerms(dtm.trump,0.995)
ui = unique(dtm.trump$i)
months <- months[ui]
posts<-fb_data[[1]]
p_id <- array(NA, length(unlist(posts)))
p_id[1:length(posts[[1]])] <- 1
last_id <- length(posts[[1]])
for (i in 2:10) {
p_id[(last_id+1):(last_id+length(posts[[i]]))] <- i;
last_id <- last_id+length(posts[[i]]);
}
p_id <- p_id[tidx]
p_id <- p_id[ui]
t_topics <- array(NA, c(11, 10))
for (m in 1:11)
for (t in 1:10)
t_topics[m,t] <- sum(trump.topics[months==m]==t)/length(trump.topics[months==m]);
library(reshape2)
library(scales)
library(ggthemes)
ct <- data.frame(t(t_topics))
rownames(ct) <- sprintf("Topic %2d", 1:10)
colnames(ct) <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul","Aug","Sep","Oct","Nov")
datm <- melt(cbind(ct, Topics = rownames(ct)), id.vars = c('Topics'))
png ("t_topic_evol.png", height=5, width=7, units = 'in', res = 200)
ggplot(datm,aes(x = variable, y = value, fill=Topics)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual("Trump", values = c("forestgreen", "orange", "darkblue","red", "gold4", "mediumpurple4","cyan4", "darkorange3", "blue","firebrick4")) +
labs(x="",y="Coverage percentage") +
theme_set(theme_gray(base_size = 18))
dev.off()
t_topics <- array(NA, c(10, 10))
for (m in 1:10)
for (t in 1:10)
t_topics[m,t] <- sum(trump.topics[p_id==m]==t)/length(trump.topics[p_id==m]);
library(ggplot2)
library(reshape2)
library(scales)
library(ggthemes)
ct <- data.frame(t(t_topics))
rownames(ct) <- sprintf("Topic %2d", 1:10)
colnames(ct) <- c("ABC", "BBC","CBS", "CNN", "Fox", "NBC",
"NPR", "NYT", "WP", "WSJ")
datm <- melt(cbind(ct, Topics = rownames(ct)), id.vars = c('Topics'))
png ("t_topic_news.png", height=5, width=7, units = 'in', res = 200)
ggplot(datm,aes(x = variable, y = value, fill=Topics)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual("Trump",
values = c("forestgreen", "orange", "darkblue","red",
"gold4", "mediumpurple4","cyan4", "darkorange3",
"blue","firebrick4")) +
labs(x="",y="Coverage percentage") +
theme_set(theme_gray(base_size = 38))
dev.off()