# Global knitr chunk options and console width for this document.
# Disable chunk caching.
knitr::opts_chunk$set(cache = FALSE)
# Default figure width; alternative figure sizes are kept commented out below.
knitr::opts_chunk$set(fig.width = 10) # for rstudio
# knitr::opts_chunk$set(fig.width = 6, fig.height = 6) # for rstudio
# knitr::opts_chunk$set(fig.width = 14, fig.height = 14) # for html
# Print output on lines of up to 100 characters.
options(width = 100)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr) # syntaxe, notamment affectation %<>%
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(GGally) # plot pairs better than default plot
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly) # plots interactifs
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Lecture des données : `read_csv`, `read_tsv`, `read_delim`, …
# First read with readr's guessed column types; the result is only printed.
"wine.data.txt" %>% read_csv()
##
## ── Column specification ────────────────────────────────────────────────────────────────────────────
## cols(
## cultivar = col_double(),
## alcohol = col_double(),
## `malic-acid` = col_double(),
## ash = col_double(),
## `alcalinity-of-ash` = col_double(),
## magnesium = col_double(),
## `total-phenols` = col_double(),
## flavonoids = col_double(),
## `nonflavonoid-phenols` = col_double(),
## proanthocyanins = col_double(),
## `color-intensity` = col_double(),
## hue = col_double(),
## `od280-od315-of-diluted-wines` = col_double(),
## proline = col_double()
## )
## # A tibble: 178 x 14
## cultivar alcohol `malic-acid` ash `alcalinity-of-… magnesium `total-phenols` flavonoids
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 1 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 1 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 1 13.2 2.59 2.87 21 118 2.8 2.69
## 6 1 14.2 1.76 2.45 15.2 112 3.27 3.39
## 7 1 14.4 1.87 2.45 14.6 96 2.5 2.52
## 8 1 14.1 2.15 2.61 17.6 121 2.6 2.51
## 9 1 14.8 1.64 2.17 14 97 2.8 2.98
## 10 1 13.9 1.35 2.27 16 98 2.98 3.15
## # … with 168 more rows, and 6 more variables: `nonflavonoid-phenols` <dbl>, proanthocyanins <dbl>,
## # `color-intensity` <dbl>, hue <dbl>, `od280-od315-of-diluted-wines` <dbl>, proline <dbl>
La 1ère colonne est interprétée comme un nombre (dbl) et non comme un facteur (fct) car elle est constituée de 1, 2 ou 3, les 3 classes possibles.
Lecture en spécifiant les types des colonnes
# Read the file again with explicit column types: the first column as a
# factor ("f"), every remaining column as a double ("d").
tb <- read_csv("wine.data.txt", col_types = "fddddddddddddd")
tb
## # A tibble: 178 x 14
## cultivar alcohol `malic-acid` ash `alcalinity-of-… magnesium `total-phenols` flavonoids
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 1 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 1 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 1 13.2 2.59 2.87 21 118 2.8 2.69
## 6 1 14.2 1.76 2.45 15.2 112 3.27 3.39
## 7 1 14.4 1.87 2.45 14.6 96 2.5 2.52
## 8 1 14.1 2.15 2.61 17.6 121 2.6 2.51
## 9 1 14.8 1.64 2.17 14 97 2.8 2.98
## 10 1 13.9 1.35 2.27 16 98 2.98 3.15
## # … with 168 more rows, and 6 more variables: `nonflavonoid-phenols` <dbl>, proanthocyanins <dbl>,
## # `color-intensity` <dbl>, hue <dbl>, `od280-od315-of-diluted-wines` <dbl>, proline <dbl>
Contenu
# Per-column summary: level counts for the factor, quantiles for the numerics.
tb %>% summary()
## cultivar alcohol malic-acid ash alcalinity-of-ash magnesium
## 1:59 Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60 Min. : 70.00
## 2:71 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20 1st Qu.: 88.00
## 3:48 Median :13.05 Median :1.865 Median :2.360 Median :19.50 Median : 98.00
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49 Mean : 99.74
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50 3rd Qu.:107.00
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00 Max. :162.00
## total-phenols flavonoids nonflavonoid-phenols proanthocyanins color-intensity
## Min. :0.980 Min. :0.340 Min. :0.1300 Min. :0.410 Min. : 1.280
## 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220
## Median :2.355 Median :2.135 Median :0.3400 Median :1.555 Median : 4.690
## Mean :2.295 Mean :2.029 Mean :0.3619 Mean :1.591 Mean : 5.058
## 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375 3rd Qu.:1.950 3rd Qu.: 6.200
## Max. :3.880 Max. :5.080 Max. :0.6600 Max. :3.580 Max. :13.000
## hue od280-od315-of-diluted-wines proline
## Min. :0.4800 Min. :1.270 Min. : 278.0
## 1st Qu.:0.7825 1st Qu.:1.938 1st Qu.: 500.5
## Median :0.9650 Median :2.780 Median : 673.5
## Mean :0.9574 Mean :2.612 Mean : 746.9
## 3rd Qu.:1.1200 3rd Qu.:3.170 3rd Qu.: 985.0
## Max. :1.7100 Max. :4.000 Max. :1680.0
Renommage des classes en 1: Chianti (C), 2: Montepulciano (M), 3: Valpolicella (V), exemple :
# Current content of the class column (levels 1, 2 and 3).
tb$cultivar
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [48] 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [95] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3
## [142] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
# Indexing with a factor uses its integer codes, so level 1 picks "C",
# level 2 picks "M" and level 3 picks "V"; the result is turned into a factor.
c("C", "M", "V")[tb$cultivar] %>% as.factor()
## [1] C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C C
## [48] C C C C C C C C C C C C M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M
## [95] M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M V V V V V V V V V V V
## [142] V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V V
## Levels: C M V
Modification du tibble : fonction mutate
# Add a `class` column holding the letter label of each row's cultivar.
# The factor of labels is indexed by the integer codes of `cultivar`, so this
# relies on the levels of `cultivar` being in the order 1, 2, 3.
tb %>% mutate(class = as.factor(c("C", "M", "V"))[cultivar])
## # A tibble: 178 x 15
## cultivar alcohol `malic-acid` ash `alcalinity-of-… magnesium `total-phenols` flavonoids
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 1 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 1 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 1 13.2 2.59 2.87 21 118 2.8 2.69
## 6 1 14.2 1.76 2.45 15.2 112 3.27 3.39
## 7 1 14.4 1.87 2.45 14.6 96 2.5 2.52
## 8 1 14.1 2.15 2.61 17.6 121 2.6 2.51
## 9 1 14.8 1.64 2.17 14 97 2.8 2.98
## 10 1 13.9 1.35 2.27 16 98 2.98 3.15
## # … with 168 more rows, and 7 more variables: `nonflavonoid-phenols` <dbl>, proanthocyanins <dbl>,
## # `color-intensity` <dbl>, hue <dbl>, `od280-od315-of-diluted-wines` <dbl>, proline <dbl>,
## # class <fct>
Ceci ajoute une dernière colonne “class” à partir de la valeur de cultivar de chaque ligne.
Pour remplacer la valeur de cultivar par cette classe, on donne le même nom de colonne.
# Overwrite `cultivar` in place with its letter label. %<>% pipes `tb` into
# mutate() and assigns the result back to `tb`.
tb %<>% mutate(cultivar = as.factor(c("C", "M", "V"))[cultivar])
tb
## # A tibble: 178 x 14
## cultivar alcohol `malic-acid` ash `alcalinity-of-… magnesium `total-phenols` flavonoids
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 C 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 C 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 C 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 C 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 C 13.2 2.59 2.87 21 118 2.8 2.69
## 6 C 14.2 1.76 2.45 15.2 112 3.27 3.39
## 7 C 14.4 1.87 2.45 14.6 96 2.5 2.52
## 8 C 14.1 2.15 2.61 17.6 121 2.6 2.51
## 9 C 14.8 1.64 2.17 14 97 2.8 2.98
## 10 C 13.9 1.35 2.27 16 98 2.98 3.15
## # … with 168 more rows, and 6 more variables: `nonflavonoid-phenols` <dbl>, proanthocyanins <dbl>,
## # `color-intensity` <dbl>, hue <dbl>, `od280-od315-of-diluted-wines` <dbl>, proline <dbl>
Remarque : L’utilisation de %<>%
de la librairie magrittr évite d’avoir à faire tb = tb %>%
La lib ggplot2 permet de produire des graphiques de bonne qualité et repose sur des bases saines pour la construction de graphiques à partir d’un jeu de données. Consultez par exemple https://www.r-graph-gallery.com/
à la tidyverse/ggplot2 (encore pas très facile pour un camembert en 2021 donc on fait un barplot).
En x les classes et pour chaque classe, les effectifs, sous forme de barplot
# Bar chart of the number of wines in each cultivar class.
tb %>%
  ggplot(aes(x = cultivar, fill = cultivar)) +
  geom_bar(stat = "count")
Considérons la variable alcohol et sa distribution par classe : est-ce utile pour déterminer la classe d’un vin ?
# Box plot of alcohol for each cultivar class.
tb %>%
  ggplot() +
  geom_boxplot(aes(x = cultivar, y = alcohol))
Il est possible de rajouter des géométries
# Same box plot with the individual observations drawn on top.
# The aesthetics are set in ggplot() so both layers share them.
tb %>%
  ggplot(aes(x = cultivar, y = alcohol)) +
  geom_boxplot() +
  geom_point()
Les points se superposent : utiliser de la transparence et/ou du jitter
# Spread the points horizontally (jitter) and make them translucent so that
# overlapping observations remain visible.
tb %>%
  ggplot(aes(x = cultivar, y = alcohol)) +
  geom_boxplot() +
  geom_jitter(width = 0.05, alpha = 0.2)
Plus moderne : des violin plots, qui tracent la courbe de densité à la place de la boîte à moustaches
# Violin plot of alcohol per class, filled by class, with jittered points,
# a black-and-white theme and a title.
tb %>%
  ggplot(aes(x = cultivar, y = alcohol)) +
  geom_violin(aes(fill = cultivar)) +
  geom_jitter(width = 0.05, alpha = 0.2) +
  theme_bw() +
  ggtitle("Distributions des degrés alcoolique par classe de vin")
Calcul des moyennes et écart-types par classe → group_by et summarise
# Mean and standard deviation of alcohol within each cultivar class.
alcohol <- tb %>%
  group_by(cultivar) %>%
  summarise(alcohol.mean = mean(alcohol), alcohol.sd = sd(alcohol))
alcohol
## # A tibble: 3 x 3
## cultivar alcohol.mean alcohol.sd
## * <fct> <dbl> <dbl>
## 1 C 13.7 0.462
## 2 M 12.3 0.538
## 3 V 13.2 0.530
Manip pour avoir des tuples (classe variable value)
# Long format: one row per (wine, variable) pair. A `rowid` column is added
# first so that each measurement keeps track of the wine it comes from.
tbg <- tb %>%
  rowid_to_column() %>%
  pivot_longer(alcohol:proline, names_to = "variable", values_to = "value")
tbg
## # A tibble: 2,314 x 4
## rowid cultivar variable value
## <int> <fct> <chr> <dbl>
## 1 1 C alcohol 14.2
## 2 1 C malic-acid 1.71
## 3 1 C ash 2.43
## 4 1 C alcalinity-of-ash 15.6
## 5 1 C magnesium 127
## 6 1 C total-phenols 2.8
## 7 1 C flavonoids 3.06
## 8 1 C nonflavonoid-phenols 0.28
## 9 1 C proanthocyanins 2.29
## 10 1 C color-intensity 5.64
## # … with 2,304 more rows
`rowid` sera nécessaire si on veut revenir au format de départ ; sinon, impossible de savoir quelles mesures (d’alcool, de ash, …) il faut rassembler pour reconstruire le tableau de départ.
# Reshape to long format and draw one box plot panel per variable.
tb %>%
  pivot_longer(alcohol:proline, names_to = "variable", values_to = "value") %>%
  ggplot(aes(cultivar, value, color = cultivar)) +
  geom_boxplot() +
  facet_wrap(~variable)
On remarque que proline écrase le reste.
Plot interactif
# Build the same faceted box plot, keep it in `p`, then hand it to plotly
# to get an interactive version.
p <- tb %>%
  pivot_longer(alcohol:proline, names_to = "variable", values_to = "value") %>%
  ggplot(aes(cultivar, value, color = cultivar)) +
  geom_boxplot() +
  facet_wrap(~variable)
ggplotly(p)
Il va donc être judicieux de normaliser les données. Nous allons utiliser une normalisation z-score qui consiste à centrer-réduire la matrice de données.
Normalisation (z-score)
# Summary statistics of each variable, computed over all wines.
znorm <- tbg %>%
  group_by(variable) %>%
  summarize(
    mean = mean(value),
    sd = sd(value),
    min = min(value),
    max = max(value),
    median = median(value)
  )
znorm
## # A tibble: 13 x 6
## variable mean sd min max median
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 alcalinity-of-ash 19.5 3.34 10.6 30 19.5
## 2 alcohol 13.0 0.812 11.0 14.8 13.0
## 3 ash 2.37 0.274 1.36 3.23 2.36
## 4 color-intensity 5.06 2.32 1.28 13 4.69
## 5 flavonoids 2.03 0.999 0.34 5.08 2.13
## 6 hue 0.957 0.229 0.48 1.71 0.965
## 7 magnesium 99.7 14.3 70 162 98
## 8 malic-acid 2.34 1.12 0.74 5.8 1.87
## 9 nonflavonoid-phenols 0.362 0.124 0.13 0.66 0.34
## 10 od280-od315-of-diluted-wines 2.61 0.710 1.27 4 2.78
## 11 proanthocyanins 1.59 0.572 0.41 3.58 1.56
## 12 proline 747. 315. 278 1680 674.
## 13 total-phenols 2.30 0.626 0.98 3.88 2.36
jointure et ajout des z-score
# Attach each variable's statistics to its measurements, then compute the
# z-score (value - mean) / sd. %<>% assigns the result back to `tbg`.
tbg %<>%
  inner_join(znorm, by = "variable") %>%
  mutate(value.z = (value - mean) / sd)
tbg
## # A tibble: 2,314 x 10
## rowid cultivar variable value mean sd min max median value.z
## <int> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 C alcohol 14.2 13.0 0.812 11.0 14.8 13.0 1.51
## 2 1 C malic-acid 1.71 2.34 1.12 0.74 5.8 1.87 -0.561
## 3 1 C ash 2.43 2.37 0.274 1.36 3.23 2.36 0.231
## 4 1 C alcalinity-of-ash 15.6 19.5 3.34 10.6 30 19.5 -1.17
## 5 1 C magnesium 127 99.7 14.3 70 162 98 1.91
## 6 1 C total-phenols 2.8 2.30 0.626 0.98 3.88 2.36 0.807
## 7 1 C flavonoids 3.06 2.03 0.999 0.34 5.08 2.13 1.03
## 8 1 C nonflavonoid-phenols 0.28 0.362 0.124 0.13 0.66 0.34 -0.658
## 9 1 C proanthocyanins 2.29 1.59 0.572 0.41 3.58 1.56 1.22
## 10 1 C color-intensity 5.64 5.06 2.32 1.28 13 4.69 0.251
## # … with 2,304 more rows
Vérification de la normalisation (moyenne à 0 et écart-type à 1)
# Sanity check: per variable, the z-scores should have mean 0 (rounded to
# 4 decimals here) and standard deviation 1.
tbg %>%
  group_by(variable) %>%
  summarize(m = round(mean(value.z), 4), sd = sd(value.z))
## # A tibble: 13 x 3
## variable m sd
## * <chr> <dbl> <dbl>
## 1 alcalinity-of-ash 0 1
## 2 alcohol 0 1
## 3 ash 0 1
## 4 color-intensity 0 1.00
## 5 flavonoids 0 1
## 6 hue 0 1.00
## 7 magnesium 0 1
## 8 malic-acid 0 1.00
## 9 nonflavonoid-phenols 0 1
## 10 od280-od315-of-diluted-wines 0 1
## 11 proanthocyanins 0 1.00
## 12 proline 0 1
## 13 total-phenols 0 1
# Render the per-variable statistics as a formatted table.
znorm %>% knitr::kable()
variable | mean | sd | min | max | median |
---|---|---|---|---|---|
alcalinity-of-ash | 19.4949438 | 3.3395638 | 10.60 | 30.00 | 19.500 |
alcohol | 13.0006180 | 0.8118265 | 11.03 | 14.83 | 13.050 |
ash | 2.3665169 | 0.2743440 | 1.36 | 3.23 | 2.360 |
color-intensity | 5.0580899 | 2.3182859 | 1.28 | 13.00 | 4.690 |
flavonoids | 2.0292697 | 0.9988587 | 0.34 | 5.08 | 2.135 |
hue | 0.9574494 | 0.2285716 | 0.48 | 1.71 | 0.965 |
magnesium | 99.7415730 | 14.2824835 | 70.00 | 162.00 | 98.000 |
malic-acid | 2.3363483 | 1.1171461 | 0.74 | 5.80 | 1.865 |
nonflavonoid-phenols | 0.3618539 | 0.1244533 | 0.13 | 0.66 | 0.340 |
od280-od315-of-diluted-wines | 2.6116854 | 0.7099904 | 1.27 | 4.00 | 2.780 |
proanthocyanins | 1.5908989 | 0.5723589 | 0.41 | 3.58 | 1.555 |
proline | 746.8932584 | 314.9074743 | 278.00 | 1680.00 | 673.500 |
total-phenols | 2.2951124 | 0.6258510 | 0.98 | 3.88 | 2.355 |
Visu
# One violin plot panel per variable, drawn on the z-scores.
tbg %>%
  ggplot(aes(cultivar, value.z, color = cultivar)) +
  geom_violin() +
  facet_wrap(~variable)
On y voit mieux.
Pour revenir au format de départ (une colonne par variable)
# Back to wide format (one column per variable), keeping only the z-scores.
# `rowid` identifies which measurements belong to the same wine.
tbz <- tbg %>%
  select(rowid, cultivar, variable, value.z) %>%
  pivot_wider(names_from = variable, values_from = value.z)
# Earlier spread()-based variant, kept commented out for reference:
# tbz = tbg %>%
#   select(cultivar_id, cultivar, variable, value.z) %>%
#   spread(key = variable, value = value.z)
tbz
## # A tibble: 178 x 15
## rowid cultivar alcohol `malic-acid` ash `alcalinity-of-… magnesium `total-phenols` flavonoids
## <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 C 1.51 -0.561 0.231 -1.17 1.91 0.807 1.03
## 2 2 C 0.246 -0.498 -0.826 -2.48 0.0181 0.567 0.732
## 3 3 C 0.196 0.0212 1.11 -0.268 0.0881 0.807 1.21
## 4 4 C 1.69 -0.346 0.487 -0.807 0.928 2.48 1.46
## 5 5 C 0.295 0.227 1.84 0.451 1.28 0.807 0.661
## 6 6 C 1.48 -0.516 0.304 -1.29 0.858 1.56 1.36
## 7 7 C 1.71 -0.417 0.304 -1.47 -0.262 0.327 0.491
## 8 8 C 1.30 -0.167 0.888 -0.567 1.49 0.487 0.481
## 9 9 C 2.25 -0.623 -0.716 -1.65 -0.192 0.807 0.952
## 10 10 C 1.06 -0.883 -0.352 -1.05 -0.122 1.09 1.12
## # … with 168 more rows, and 6 more variables: `nonflavonoid-phenols` <dbl>, proanthocyanins <dbl>,
## # `color-intensity` <dbl>, hue <dbl>, `od280-od315-of-diluted-wines` <dbl>, proline <dbl>
# Scatter plot of two normalised variables. Backticks are required because
# the column name contains hyphens.
tbz %>%
  ggplot(aes(x = `alcalinity-of-ash`, y = alcohol)) +
  geom_point()
Avec des couleurs et des formes
# Same scatter plot with larger points whose colour and shape encode the class.
tbz %>%
  ggplot(aes(x = `alcalinity-of-ash`, y = alcohol)) +
  geom_point(size = 3, aes(color = cultivar, shape = cultivar))
http://larmarange.github.io/analyse-R/graphiques-bivaries-ggplot2.html
La librairie GGally permet de faire les analyses bivariées sous forme d’un seul plot
# Pairwise plot matrix of all columns of the raw data, coloured by class.
tb %>%
  ggpairs(aes(color = cultivar, alpha = 0.1))
La même sur les données normalisées
# Same pairwise plot matrix on the normalised data; `rowid` is dropped first
# so that it is not plotted.
tbz %>%
  select(-rowid) %>%
  ggpairs(aes(color = cultivar, alpha = .1))
matrice de corrélations
# Correlation matrix plot; non-numeric columns such as `cultivar` are
# ignored with a warning.
tb %>%
  ggcorr()
## Warning in ggcorr(.): data in column(s) 'cultivar' are not numeric and were ignored
Est-ce que des groupes se distinguent à partir des données ?
# Principal component analysis on the numeric columns. cor = TRUE makes
# princomp work on the correlation matrix instead of the covariance matrix.
tb.acp <- tb %>%
  select(-cultivar) %>%
  as.matrix() %>%
  princomp(cor = TRUE)
# Standard deviation and share of variance of each component.
tb.acp %>% summary()
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## Standard deviation 2.1692972 1.5801816 1.2025273 0.9586313 0.92370351 0.80103498 0.74231281
## Proportion of Variance 0.3619885 0.1920749 0.1112363 0.0706903 0.06563294 0.04935823 0.04238679
## Cumulative Proportion 0.3619885 0.5540634 0.6652997 0.7359900 0.80162293 0.85098116 0.89336795
## Comp.8 Comp.9 Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.59033665 0.53747553 0.50090167 0.47517222 0.41081655 0.321524394
## Proportion of Variance 0.02680749 0.02222153 0.01930019 0.01736836 0.01298233 0.007952149
## Cumulative Proportion 0.92017544 0.94239698 0.96169717 0.97906553 0.99204785 1.000000000
La variance portée par les premiers axes semble assez prometteuse.
Ajout des coordonnées projetées dans le nouveau repère à 2 dimensions (2 premières composantes principales)
# Store the coordinates of each wine on the first two principal components
# as new columns of `tb`.
# The former `names(tb.acp$score[, 1:2]) = ...` statement was dropped: it did
# not rename anything and only added a stray `score` element to `tb.acp`.
# The element is referenced by its full name `scores` rather than through
# partial matching of `$score`.
tb$Comp1 <- tb.acp$scores[, 1]
tb$Comp2 <- tb.acp$scores[, 2]
tb
visu
# Wines projected on the first two principal components; colour and shape
# encode the class.
tb %>%
  ggplot(aes(Comp1, Comp2, color = cultivar, shape = cultivar)) +
  geom_point() +
  ggtitle("ACP normée princomp wine")
C’est prometteur (séparation des classes par l’utilisation d’une combinaison des variables).