Visual Analytics Final Project: Portuguese Wine and Which Components Correlate to Higher Quality
Visual Analytics Final Project
Tyler House
2023-04-27
library(ggplot2)
library(ggcorrplot)
library(ggmap)
library(ggraph)
library(ggpubr)
library(tidyverse)
library(ggthemes)
library(dplyr)
library(readr)
library(tinytex)
library(latexpdf)
library(stats)
# Read the red-wine quality CSV into `winedata`, then print its structure
# (one line per column with its type and first few values).
# NOTE(review): the path is absolute and specific to one Windows machine, so
# the script will not run elsewhere without editing it.
winedata <- read_csv(
  file = "C:\\Users\\tyler\\Desktop\\Spring 2023\\Visual Analytics\\Final Project\\winequality-red.csv"
)
str(object = winedata)
## spc_tbl_ [1,599 × 12] (S3:
spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed
acidity : num [1:1599] 7.4 7.8 7.8
11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile
acidity : num [1:1599] 0.7 0.88 0.76
0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric
acid : num [1:1599] 0 0 0.04 0.56
0 0 0.06 0 0.02 0.36 ...
## $ residual
sugar : num [1:1599] 1.9 2.6 2.3 1.9
1.9 1.8 1.6 1.2 2 6.1 ...
## $
chlorides : num [1:1599] 0.076
0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free
sulfur dioxide : num [1:1599] 11 25 15 17 11 13 15 15 9 17 ...
## $ total
sulfur dioxide: num [1:1599] 34 67 54 60 34 40 59 21 18 102 ...
## $
density : num [1:1599] 0.998
0.997 0.997 0.998 0.998 ...
## $ pH : num [1:1599] 3.51 3.2 3.26
3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $
sulphates : num [1:1599] 0.56
0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $
alcohol : num [1:1599] 9.4
9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $
quality : num [1:1599] 5 5 5
6 5 5 5 7 7 5 ...
## - attr(*,
"spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
## - attr(*,
"problems")=<externalptr>
# Print per-column summary statistics (min, quartiles, median, mean, max).
summary(winedata)
## fixed
acidity volatile acidity citric acid
residual sugar
## Min. : 4.60
Min. :0.1200 Min.
:0.000 Min. : 0.900
## 1st Qu.:
7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median :
7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32
Mean :0.5278 Mean
:0.271 Mean : 2.539
## 3rd Qu.:
9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90
Max. :1.5800 Max.
:1.000 Max. :15.500
##
chlorides free sulfur
dioxide total sulfur dioxide
density
## Min. :0.01200
Min. : 1.00 Min.
: 6.00 Min.
:0.9901
## 1st
Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747
Mean :15.87 Mean
: 46.47 Mean :0.9967
## 3rd
Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100
Max. :72.00 Max.
:289.00 Max. :1.0037
##
pH sulphates alcohol quality
## Min. :2.740
Min. :0.3300 Min.
: 8.40 Min. :3.000
## 1st
Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median
:3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311
Mean :0.6581 Mean
:10.42 Mean :5.636
## 3rd
Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010
Max. :2.0000 Max.
:14.90 Max. :8.000
# Preview the first four rows of the data set.
head(winedata, n = 4)
## # A tibble: 4 × 12
## `fixed
acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##
<dbl>
<dbl>
<dbl> <dbl> <dbl>
## 1
7.4 0.7 0 1.9 0.076
## 2
7.8 0.88 0 2.6 0.098
## 3
7.8 0.76 0.04 2.3 0.092
## 4
11.2 0.28 0.56 1.9 0.075
## # ℹ 7 more variables: `free sulfur dioxide`
<dbl>,
## # `total
sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates
<dbl>,
## # alcohol
<dbl>, quality <dbl>
#For this numerical analysis we will
run correlation tests between all variables and the quality index column which
is on a scale of 3-8 from this dataset. From the results we will select the
three variables with the greatest correlation under the Pearson statistical
measure, and we will visualize these variables in Part Two.
#Correlation Analysis
## Pearson correlation between each compositional element and the quality
## index. `use = "complete.obs"` restricts each calculation to rows where
## both values are present.

## Fixed Acidity and Quality Index
fac <- cor(winedata$`fixed acidity`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Volatile Acidity and Quality Index
vac <- cor(winedata$`volatile acidity`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Citric Acid Concentration and Quality Index
cac <- cor(winedata$`citric acid`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Residual Sugar and Quality Index
ras <- cor(winedata$`residual sugar`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Chloride Concentration and Quality Index
cc <- cor(winedata$chlorides, winedata$quality,
          method = "pearson", use = "complete.obs")
## Free Sulfur Dioxide Concentration and Quality Index
fsd <- cor(winedata$`free sulfur dioxide`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Total Sulfur Dioxide Concentration and Quality Index
tsd <- cor(winedata$`total sulfur dioxide`, winedata$quality,
           method = "pearson", use = "complete.obs")
## Density and Quality Index
dd <- cor(winedata$density, winedata$quality,
          method = "pearson", use = "complete.obs")
## pH and Quality Index
phc <- cor(winedata$pH, winedata$quality,
           method = "pearson", use = "complete.obs")
## Sulphate Concentration and Quality Index
scc <- cor(winedata$sulphates, winedata$quality,
           method = "pearson", use = "complete.obs")
## Alcohol Concentration and Quality Index
ac <- cor(winedata$alcohol, winedata$quality,
          method = "pearson", use = "complete.obs")

## Now we will compile these correlations into a table.
## The labels and the coefficients are listed in the same order, so each row
## pairs a variable's short name with its own coefficient.
correlationdata <- c(
  "ac", "cac", "cc", "dd", "fac", "fsd", "phc", "ras", "scc", "tsd", "vac"
)
r <- c(ac, cac, cc, dd, fac, fsd, phc, ras, scc, tsd, vac)
correlationtable <- data.frame(correlationdata, r)
# Keep `colnames(...) <-` on one line: a line break before `<-` ends the
# statement early and leaves `<- c(...)` as a syntax error. The names are
# single-line strings so they match the column names used by the bar plot.
colnames(correlationtable) <- c(
  "Compositional Element",
  "Pearson Coefficient (r)"
)
# Correlation Bar Plot Source
# One bar per compositional element; the bar height is that element's Pearson
# coefficient with the quality index (stat = "identity" plots values as-is).
pearsonbarplot <- ggplot(
  correlationtable,
  # Name the columns directly instead of using `correlationtable$...` inside
  # aes(): ggplot2 resolves the names in the data argument, and spelling out
  # the full column name avoids relying on partial matching by `$`.
  aes(x = `Compositional Element`, y = `Pearson Coefficient (r)`)
) +
  geom_bar(stat = "identity", fill = "#009d86", color = "black") +
  labs(
    y = "Pearson Coefficient (r)",
    x = "Compositional Element of the Wine"
  ) +
  theme_minimal()
pearsonbarplot
#The results of the Pearson
Correlation Analysis show that the Citric Acid Concentration, Sulphates
Concentration and Alcohol Content are the most positively correlated with
Quality.
#Using the p-values now, we are
going to make a correlogram, so to speak, of the different variables against
each other. With the use of hierarchical clustering I can view the independent
relationships of each individual variable against one another, and in reference
to the data found previously I can infer which combinations of elements might
contribute to lower quality.
# Pairwise correlations between every column, rounded to one decimal place so
# the in-cell labels stay short.
correlation <- round(cor(winedata), 1)
# Matrix of p-values for the same pairwise correlations.
c.matrix <- cor_pmat(winedata)

# Pass the p-values to both plots so the correlograms actually use them.
# NOTE(review): with ggcorrplot's defaults (sig.level = 0.05, insig = "pch")
# correlations that are not significant should be drawn with a cross; confirm
# this is the desired presentation.
lowercorrelation <- ggcorrplot(
  correlation,
  hc.order = TRUE,
  lab = TRUE,
  type = "lower",
  method = "circle",
  colors = c("#6D9EC1", "white", "#E46726"),
  p.mat = c.matrix
)
highercorrelation <- ggcorrplot(
  correlation,
  hc.order = TRUE,
  lab = TRUE,
  type = "upper",
  p.mat = c.matrix
)

## Lower Half Correlation
lowercorrelation
## Upper Half Correlation
highercorrelation
#Statistical Results:
##The elements with the highest positive correlation
to quality of the wine from this data set are the Citric Acid, Sulfate and
Alcohol Concentrations. Without having to ask, I know why higher alcohol makes
a better wine: who doesn't like a little more buzz per serving, right? As for
the other two components, I found in some research outside the data set that
sulfates and sulfites (because of the presence of Sulfur Dioxide) in wine are
the preservation agent and the enhancer of flavor in a multitude of ways.
According to a few articles written by winemakers, these elements are
found as a byproduct of yeast fermentation, which is a process that makes any
number of ethyl alcohol derivatives. Chemically, the presence of greater
amounts of sulfur-based byproducts is related to a higher alcohol production
and therefore a higher quality on average.
##The next element was Citric Acid and,
these days, people are getting testier and testier with their personalities
and mannerisms, especially with their drinks of choice. Citric Acid is a
compound that is notably bitter when introduced in higher amounts to any drink's
composition, but it is used mainly to prevent ferric hazes within the wines and
also to add more flavor to the flatter, sweeter wines that are more common in the
Iberian Peninsula. It adds a sharpness to the flavor and also allows
microbial agents to metabolize and ferment at a higher rate, which produces more
of the sulfates and alcohol.
##The final element, and the element that
is most heavily correlated with the previous two would be the concentration of
alcohol within the wine. The Sulfur byproducts and citric acid concentrations
relative to one another and the other compounds will assist in driving the
alcohol gravity higher, as free ethyl alcohols are free to bind with sulfurs
and other free compounds, as well as the citric acid being able to allow
microbes to metabolize and produce more alcohol quicker.
#In my time in Spain and Portugal, many of the
wines I had were of a bittersweet and strong profile, with a heavy alcohol
content and a sour aftertaste. I am sure now that the experience I had is
reflected in part by the data, which were taken from wines of the same regional
profile that I experienced while on vacation.
#My final say is that the sommelier was correct:
simple acid levels, sulfur byproduct levels and alcohol concentrations are
the most highly correlated with wine quality. But from my experience I want to
clarify that the wines I tasted that were tastier happened to be higher
in alcohol. As the alcohol content rose, the body of the wine covered more
flavor profiles, which backs up my research stating that the other compounds
tend to be related to alcohol production quantities and rates.
#Overall, the initial hypothesis proves to be
partially correct from this data based on the variables they share, such as
acidity and alcohol content; however, the data added to what the sommelier said
by showing a high correlation of sulfites and sulfates to the other variables
and to overall quality. Sugar, one of his talking points, when compared to
the data in this analysis, showed an almost nonexistent correlation with quality
(Pearson r < 0.02). Regardless, his information was valid to an extent, which
leaves more questions as to what relationships of compounds are optimal for a truly
fine wine.
##Short Summary
#This relates to the work in this class because
it is always possible to find correlations in any data and make inferences on
that data, if it is clean enough and broad enough to cover multiple
variables and, above all else, is able to show a wide array of population
members in the data set that make for a solid analysis. I wanted to have fun
with an interest of mine by finding real-world data to work with and test
against a fond experience of my past. I took that trip 6 years ago and I am
still very fond of the times I have spent there, and I have been back since. It
relates to the methods of this course because, essentially, the point of a
visualization is to see the trends and relationships within data rather than just
a table. It allows us to possibly see undiscovered trends to test
numerically that can help enrich our understanding of the data we might be
working with. These were the underlying fundamentals of this analysis, and that
brings it to the end. Thank you for a wonderful semester, Dr. Friedman!!
Comments
Post a Comment