-
Notifications
You must be signed in to change notification settings - Fork 1
/
first_analysis.R
185 lines (159 loc) · 9.12 KB
/
first_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#rm(list = ls()) #remove all the existing environment vars
wdir <- ""
dataDir <- "data/"
packagesFile<-"pack2.txt"
source(paste(wdir, "functions.R", sep="")) ### this also loads every needed package
#loadDatasets(paste(wdir,dataFile,sep="")) ###USE THIS IF YOU CURRENTLY HAVEN'T DATASETS IN WORKSPACE
"Basic Statistics and plots..."
data<-aida
par(mfrow=c(2,2))
tb.region <- sort(table(data$Region))
tb.region
barplot(tb.region,xlab = "Region")
barplot(tb.region[(length(tb.region)-2):length(tb.region)],xlab = "Top Region")
#provinces distribution
tb.province <- sort(table(data$Province))
barplot(tb.province,xlab = "Province")
barplot(tb.province[(length(tb.province)-2):length(tb.province)],xlab = "Top Province")
par(mfrow=c(1,1))
#The regions with largest numbers of firms are from North(Lombardia, VENETO, Emilia Romagna)
#Instead the most common provinces are MILANO, VICENZA, BRESCIA
#geo area distibution
tb.geo = table(data$GeoArea) # SUD: 2000509 (23.8%), CENTRO: 2097369(25%), NORD(51.2%):
#tb.geo = tb.geo[order(tb.geo)]
#tb.geo <- scale(tb.geo, FALSE, colSums(tb.geo)) * 100
barplot(tb.geo,xlab="Geo Area" )
ggplot(data, aes(x=GeoArea, fill=GeoArea)) +
geom_bar(aes(y=prop.table((..count..)/sum(..count..))), position = "dodge") +
ggtitle("Firms distribution by Geographic Area") +
xlab("Geographic Areas") + ylab ("Firms percentage") +
geom_text(aes(y = prop.table((..count..)/sum(..count..)) / 2,
label = paste0(round(prop.table(..count..) * 100, 1), '%')),
stat = 'count', position = position_dodge(.9), size = 3) +
scale_x_discrete(limits=c("Sud", "Nord", "Centro")) +
scale_fill_manual(values=wes_palette(n=3, name="Darjeeling1")) +
guides(fill=F) + theme(plot.title = element_text(hjust = .5))
by.geo = split(data,data$GeoArea)
plotByGeo = function(x){ #PLOTS PROVINCES AND REGIONS FOR EACH GEOAREA
par(mfrow=c(1,1))
nomi <- c(names(x))
k<-1
for(i in x){
#distribution by province
tb.province <- sort(table(i$Province))
title=paste("",nomi[k], sep=" ")
barplot(tb.province[106:110],xlab = "Top Province" ,main =title )
#distribution by region
tb.region = table(i$Region)
tb.region = tb.region[order(tb.region)]
title=paste("",nomi[k], sep=" ")
barplot(tb.region[15:20],xlab = "Top Region ",main = title)
k=k+1
}
par(mfrow=c(1,1))
}
plotByGeo(by.geo)
# DISTRIBUTION BY SIZE
{
tb.size<- sort(table(data$Size)) # LARGE: 33681(0.4%), MEDIUM: 1061477(12.64%), SMALL: 86.96%
'ggplot(data, aes(x=Size, fill=Size)) +
geom_bar(aes(y=prop.table((..count..)/sum(..count..))), position = "dodge") +
ggtitle("Firms distribution by Size") +
xlab("Firm Sizes") + ylab ("Firms percentage") +
geom_text(aes(y = prop.table((..count..)/sum(..count..)) / 2,
label = paste0(round(prop.table(..count..) * 100, 1), "%")),
stat = "count", position = position_dodge(.9), size = 3) +
scale_fill_manual(values=wes_palette(n=3, name="Darjeeling1")) +
guides(fill=F) + theme(plot.title = element_text(hjust = .5))
'
#plot with exp scale (exp=0.4, very similar to sqrt!)
ggplot(data, aes(x=Size, fill=Size)) +
geom_bar(aes(y=((prop.table((..count..)/sum(..count..)))*1000)^.4), position = "dodge") +
ggtitle("Firms distribution by Size") +
xlab("Firm Sizes") + ylab ("Firms percentage") +
geom_text(aes(y = (prop.table((..count..)/sum(..count..)) *1000)^.4 / 2,
label = paste0(round(prop.table(..count..) * 100, 1), '%')),
stat = 'count', position = position_dodge(.9), size = 3) +
scale_fill_manual(values=wes_palette(n=3, name="Darjeeling1")) +
scale_y_continuous(labels = trans_format(function(y) y^2.5/1000, format = percent)) +
guides(fill=F) + theme(plot.title = element_text(hjust = .5))
}
### SIZE BY GEOAREA ###
{
countBySud<-table(data$Size[data$GeoArea=='Sud'])
countByCentro<-table(data$Size[data$GeoArea=='Centro'])
countByNord<-table(data$Size[data$GeoArea=='Nord'])
myFreq<-rbind(countBySud/sum(countBySud),countByCentro/sum(countByCentro), countByNord/sum(countByNord))
tmpDf<-data.frame(cbind("GeoArea"=c("Sud", "Centro", "Nord"),myFreq))
# melt the data frame for plotting
melted <- melt(tmpDf, id.vars='GeoArea')
melted$value <- as.double(melted$value)
# ok, the percentages distribution of Small, Medium and Large are very similar within each GeoArea
ggplot(melted, aes(GeoArea, value)) +
geom_bar(aes(fill = variable), position = "dodge", stat="identity")
}
### GEOAREA BY SIZE ###
{
countBySmall<-table(data$GeoArea[data$Size=='Small'])
countByMedium<-table(data$GeoArea[data$Size=='Medium'])
countByLarge<-table(data$GeoArea[data$Size=='Large'])
myFreq<-rbind(countBySmall/sum(countBySmall),countByMedium/sum(countByMedium), countByLarge/sum(countByLarge))
tmpDf<-data.frame(cbind("Size"=c("Small", "Medium", "Large"),myFreq))
# melt the data frame for plotting
melted <- melt(tmpDf, id.vars='Size')
melted$value <- as.double(melted$value)
# Large firms are much more "common" in Nord with respect to Centro and Sud
ggplot(melted, aes(Size, value)) +
geom_bar(aes(fill = variable), position = "dodge", stat="identity") +
ggtitle("Geographic Area distribution by Size") +
xlab("Firm Sizes") + ylab ("Firms Percentage") +
geom_text(aes(fill=variable, label = paste0(round(100*value, 1),"%"), y=value/2),size = 3 , position = position_dodge(.9)) +
scale_fill_manual(values=wes_palette(n=3, name="Darjeeling1")) +
theme(plot.title = element_text(hjust = .5))
}
remove(melted,tmpDf,myFreq,data, by.geo)
"Useful subsets regarding firms which have missing values for Employee column"
{
if(!exists("aidat"))
load(paste0(wdir,dataDir,"aidat.R"))
manufacturing <- subset(aidat,Ateco>=101100 & Ateco<=332009 & Year>2006 & Year<2016) #subset containing all the manufacturing firms from original dataset
manufacturingNoMiss <- subset(manufacturing, R>=0 & E>=0)
#remove(aidat)
#unId<-unique(manufacturing$TaxID) #vector containing all the unique taxIDs from manufacturing firms
onlyMissEmployees <- subset(manufacturing,is.na(manufacturing$E)) #subset of manufacturing firms having at least one "NA" as Employee
missingEmpID <- unique(onlyMissEmployees$TaxID) #vector of unique taxIDs having at least one "NA" as Employee
#tmp <- group_by(onlyMissEmployees,TaxID)
#distinctEmpByID <- summarize(tmp,countE=n()) #grouping by TaxId and counting the number of NA for each TaxId
#distinctEmpByID <- arrange(distinctEmpByID, countE)
firmsMissingEmpRows <- dplyr::filter(manufacturing, manufacturing$TaxID %in% missingEmpID) #subset of rows containing only the firms having at least one "NA" as Employee
#missingEmpSortByTaxID <- arrange(firmsMissingEmpRows,TaxID,E,Year) #sorting the last subset by TaxID, N°Employee and Year
#tmp <- group_by(missingEmpSortByTaxId,TaxID)
#distinctEmpByID2 <- summarize(tmp,countE=n()) #grouping by TaxId and counting the number of rows for each TaxId
#distinctEmpByID2 <- arrange(distinctEmpByID2, countE)
"Counting the Employee missing values for each firm"
countMissingByTaxID <- dplyr::filter(onlyMissEmployees, is.na(onlyMissEmployees$E)) %>%
group_by(taxID = onlyMissEmployees$TaxID) %>%
summarise(numMissing = n())
countMissingByTaxID$numYears <- count(firmsMissingEmpRows, firmsMissingEmpRows$TaxID)$n # associating to each firm(taxID) the number of missing values and the number of years the firm appears in the whole dataset
countMissingByTaxID$percMissing <- countMissingByTaxID$numMissing / countMissingByTaxID$numYears #associating the percent of missing values for Employee to each taxID
countMissingByTaxID <- arrange(countMissingByTaxID, percMissing, numYears)
tmpC<-group_by(countMissingByTaxID, percMissing)
tmpC<-count(tmpC, percMissing)
plot(tmpC$percMissing,tmpC$n,type = "l", xlab = "percMissing", ylab = "count")
#### ROWS AND TAX IDs OF FIRMS HAVING AT LEAST 1 NA AS REVENUE ####
onlyMissRevenues <- subset(manufacturing,is.na(manufacturing$R)) #subset of manufacturing firms having at least one "NA" as Employee
missingRevID <- unique(onlyMissRevenues$TaxID) #vector of unique taxIDs having at least one "NA" as Employee
firmsMissingRevRows <- dplyr::filter(manufacturing, manufacturing$TaxID %in% missingRevID) #subset containing only the firms having at least one "NA" as Employee
#### SELECTING ALL THE ROWS OF FIRMS HAVING ALL THE ENTRIES FOR YEARS 2006-2015 ####
nonMissingRevenues<-dplyr::filter(manufacturing,! is.na(R)) #removing all the rows having missing values for Revenue
nine_years_TaxID<-group_by(nonMissingRevenues,TaxID) %>%
count()
nine_years_TaxID<-dplyr::filter(nine_years_TaxID,n==9)
nine_year_firms<-dplyr::filter(nonMissingRevenues, nonMissingRevenues$TaxID %in% nine_years_TaxID$TaxID) %>%
arrange(TaxID,Year) #subset containing only the firms having entries for each year(between 2007 and 2016) and Revenue NOT missing
nine_years_TaxID2<-group_by(manufacturingNoMiss,TaxID) %>%
count()
nine_years_TaxID2<-dplyr::filter(nine_years_TaxID2,n==9)
nine_year_firms2<-dplyr::filter(manufacturingNoMiss, manufacturingNoMiss$TaxID %in% nine_years_TaxID2$TaxID) %>%
arrange(TaxID,Year) #subset containing only the firms having entries for each year(between 2007 and 2016) with Revenue and Employee NOT missing
}