Today I will not tell you which graphical parameters you can use, but
you have to do your best to make your plots nice! Notice… you can change
the legend name using `+labs()`.
First step: read `athlete_events.csv` from the `Datasets` folder,
assign it to a variable `ath` and explore it (you can also
use the link above for the description of columns). How many lines does
the dataset contain? How many unique people are included?
ath=read.csv("Datasets/athlete_events.csv")
# Size of the dataset: number of rows (entries) followed by number of columns.
dim(ath)
## [1] 271116 15
# Number of distinct values in the Name column.
# NOTE(review): this counts distinct names, so different athletes sharing the
# same name are counted once; if the data has a per-athlete identifier column,
# counting that would be more precise — confirm against the column description.
length(unique(ath$Name))
## [1] 134732
Second step: let’s make some figures that describe the dataset.
Remember to assign each plot to a variable `PlotN`
(`Plot1`, `Plot2`, …). In this way you will be
able to insert it in the structure later on. Make the following plots (noticing that each
time you have to make some transformations to the dataset to keep only
the `unique()` version of the columns you need. In fact, for
example, the same person can participate in more than one type of game, but
if you are interested in the people that participated in the Olympic games
during a year you would not want to count that person twice):
- `Plot1`, representing the unique number of male
and female athletes that participated each year in the Olympic games. Make
it with percentages. Hint: remember to transform year from continuous to
categorical.
- `Plot2`, representing the ages of athletes at
each NOC divided by males and females. Also identify the youngest and the
oldest athletes for both males and females and add their names on the plot in
the right place (if more than one per sex and min/max exists, choose only
the first to plot). Notice that many values of age are missing. Hint:
for the annotation use `F` and `M` for the `x` positions.
- `Plot3`, representing a 2D density plot of Height and Weight
of all athletes in all NOC. Hint: explore https://r-graph-gallery.com/2d-density-plot-with-ggplot2.html
for some ideas.
library(ggplot2)
# Plot1: share of male and female athletes taking part in each Olympic year.
# Keep one row per (Name, Sex, Year) so that an athlete entered in several
# events of the same year is counted only once.
ol <- unique(ath[, c("Name", "Sex", "Year")])
Plot1 <- ggplot(ol, aes(x = as.character(Year), fill = Sex)) +
  # position = "fill" rescales every bar to a total of 1 (proportions per year)
  geom_bar(position = "fill") +
  # Show the 0-1 proportions as percentages so the axis matches its "%" title
  scale_y_continuous(labels = function(p) paste0(100 * p, "%")) +
  scale_fill_manual(values = c("M" = "cornflowerblue", "F" = "magenta")) +
  theme_bw() +
  # Year labels are rotated so that they do not overlap
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab("year") +
  ylab("% males and females")
# Plot2: age distribution of athletes by sex, annotated with the names of the
# youngest and oldest athlete of each sex.
# One row per (Name, Sex, NOC, Age) combination.
ag <- unique(ath[, c("Name", "Sex", "NOC", "Age")])
f <- subset(ag, Sex == "F")
m <- subset(ag, Sex == "M")

# Youngest / oldest ages per sex; missing ages are ignored. These values place
# the name labels, so the annotations follow the data instead of fixed numbers.
age_range_f <- range(f$Age, na.rm = TRUE)
age_range_m <- range(m$Age, na.rm = TRUE)

# which.min()/which.max() skip NA and return the first matching row, i.e. only
# the first athlete is kept when several share the extreme age.
min_f <- f$Name[which.min(f$Age)]
# The label of the oldest female athlete is set by hand; presumably a shortened
# form of the name stored in the data (f$Name[which.max(f$Age)]) — TODO confirm.
max_f <- "E.L. Ernesta Robert-Mrignac"
min_m <- m$Name[which.min(m$Age)]
max_m <- m$Name[which.max(m$Age)]

Plot2 <- ggplot(ag, aes(x = Sex, y = Age, fill = Sex)) +
  # Many ages are missing; drop them explicitly rather than with a warning
  geom_violin(na.rm = TRUE) +
  scale_fill_manual(values = c("M" = "cornflowerblue", "F" = "magenta")) +
  theme_bw() +
  xlab("Athlete sex") +
  ylab("Years old") +
  annotate(geom = "text", label = min_f, x = "F", y = age_range_f[1]) +
  annotate(geom = "text", label = max_f, x = "F", y = age_range_f[2]) +
  annotate(geom = "text", label = min_m, x = "M", y = age_range_m[1]) +
  annotate(geom = "text", label = max_m, x = "M", y = age_range_m[2])
# Plot3: 2D density of Height against Weight over all athletes.
# One row per (Name, Sex, NOC, Height, Weight) combination.
aw <- unique(ath[, c("Name", "Sex", "NOC", "Height", "Weight")])
Plot3 <- ggplot(aw, aes(x = Height, y = Weight)) +
  # Filled contour polygons; the fill follows the density level computed by
  # the stat. after_stat() replaces the deprecated ..level.. notation.
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon", colour = "white") +
  theme_bw() +
  scale_fill_gradient(low = "darkslateblue", high = "deepskyblue") +
  labs(fill = "Level")
Create a data frame from the `table()`
result of Sex and Medal. Then,
create a table plot using this data and assign it to a variable
`Table`.
library(ggpubr)
# Cross-tabulate Sex against Medal and flatten the counts into a data frame
# with one row per (Sex, Medal) pair, using readable column names.
ta <- setNames(
  as.data.frame(table(ath$Sex, ath$Medal)),
  c("Sex", "Medal", "Number of medals")
)

# Render the data frame as a table graphic (no row names, "classic" style,
# custom cell padding) so it can be placed in the combined figure.
table_theme <- ttheme(base_style = "classic", padding = unit(c(15, 4), "mm"))
Table <- ggtexttable(ta, rows = NULL, theme = table_theme)
Make `Plot4` using the data of the table you
created. Create a dodged bar plot indicating as `x` the Medals,
as `y` the Number of medals and as `fill` the Sex.
Plot4=ggplot(ta, aes(x=Medal,y=`Number of medals`, fill=Sex))+geom_bar(stat="identity", position = "dodge", color="black")+scale_fill_manual(values=c("M"="cornflowerblue", "F"="magenta"))+theme_bw()
Subset `ath`, keeping only Gold medals. Then:
- create a column `Range_of_time` in which you label years
according to the following steps: <=1930, >1930 & <=1972,
>1972 & <=1999 and >1999 (use values “early 90s”, “middle
90s”, “late 90s” and “00s”);
- convert the `Range_of_time` values into factors to
make an ordered plot.
gold=subset(ath, Medal=="Gold")
# Label every gold medal with the period its year falls in.
# cut() uses right-closed intervals, so the bins are exactly:
#   <=1930, >1930 & <=1972, >1972 & <=1999, >1999.
# A missing Year stays NA instead of being silently labelled "early 90s".
gold$Range_of_time <- as.character(cut(
  gold$Year,
  breaks = c(-Inf, 1930, 1972, 1999, Inf),
  labels = c("early 90s", "middle 90s", "late 90s", "00s")
))

# Count the gold medals of one sex per period.
# Returns a data frame with columns Var1 (period, as a factor whose levels run
# from the most recent to the oldest period, which fixes the plotting order)
# and Freq (number of medals).
count_gold_by_range <- function(medals, sex) {
  counts <- as.data.frame(table(medals$Range_of_time[medals$Sex == sex]))
  counts$Var1 <- factor(
    counts$Var1,
    levels = c("00s", "late 90s", "middle 90s", "early 90s")
  )
  counts
}

gold_f <- count_gold_by_range(gold, "F")
gold_m <- count_gold_by_range(gold, "M")
# Fill colour for each period, shared by both pie charts.
range_colours <- c(
  "00s" = "steelblue1",
  "late 90s" = "deeppink1",
  "middle 90s" = "gold",
  "early 90s" = "seagreen2"
)

# Build a pie chart of gold medals per period from a data frame with columns
# Var1 (period) and Freq (count): a single stacked bar rescaled to 1
# (position = "fill") and wrapped around the y axis with polar coordinates.
gold_pie <- function(counts, title) {
  ggplot(counts, aes(x = "", y = Freq, fill = Var1)) +
    geom_bar(color = "black", position = "fill", stat = "identity") +
    coord_polar("y") +
    theme_bw() +
    xlab(NULL) +
    ylab(NULL) +
    labs(fill = "Range of time") +
    theme(axis.ticks = element_blank()) +
    scale_fill_manual(values = range_colours) +
    ggtitle(title)
}

Plot5 <- gold_pie(gold_f, "Gold medals females")
Plot6 <- gold_pie(gold_m, "Gold medals males")
Now combine all the plots in one figure. You will need several `ggarrange()`
functions, as each line has a specific number of
plots and different column widths. Let’s build one block at a time:
- Use `ggarrange()` to combine `Plot1` and `Plot2` in one row. Put labels “A” and “B”. Assign this
figure to a variable `a`.
- Use `ggarrange()` to combine `Plot3`, `Table` and `Plot4` in one row. Fix
`widths=c(1, 1.5,1)`. Put labels “C”, “D” and “E”. Assign
this figure to a variable `b`.
- Use `ggarrange()` to combine `Plot5` and `Plot6` in one row. Put labels “F” and “G”. Assign this
figure to a variable `c`.
- Combine `a`, `b` and `c` and assign this figure to a variable
`plot`.
- Use `annotate_figure()` for adding annotations on the left
and bottom sides. Remember to fix “bold” as face and to rotate the
annotation on the left.
- Save the figure with `pdf()`, choosing width and height (I
personally used
`width=unit(10, "cm"), height = unit(9, "cm")`).
library(grid)
# Row 1: Plot1 and Plot2 side by side, legends hidden, panels labelled A and B.
a <- ggarrange(
  Plot1, Plot2,
  nrow = 1, common.legend = FALSE, legend = "none",
  labels = c("A", "B"), vjust = 1.1
)

# Row 2: Plot3, the medal table and Plot4; the table gets the widest column.
b <- ggarrange(
  Plot3, Table, Plot4,
  nrow = 1, widths = c(1, 1.5, 1), legend = "right",
  labels = c("C", "D", "E"), vjust = 1.1
)

# Row 3: the two pie charts sharing a single legend on the right.
c <- ggarrange(
  Plot5, Plot6,
  nrow = 1, common.legend = TRUE, legend = "right",
  labels = c("F", "G"), vjust = 1.1
)

# Stack the three rows into one figure.
plot <- ggarrange(a, b, c, nrow = 3, common.legend = TRUE)

# Add a bold caption below the figure and a bold, rotated title on its left.
figure_caption <- text_grob("Figure n. 1", face = "bold", size = 14)
figure_title <- text_grob(
  "Exploration of data on athletes across the olympic history",
  rot = 90, face = "bold", size = 14
)
plot2 <- annotate_figure(plot, bottom = figure_caption, left = figure_title)
plot2

# Optional export to PDF (left disabled).
# NOTE(review): pdf() documents width/height in inches; the unit(..., "cm")
# wrappers are presumably not honoured as centimetres — confirm before use.
#pdf("~/Downloads/prova.pdf", useDingbats = F, width=unit(10, "cm"), height = unit(9, "cm"))
#plot2
#dev.off()