ggplot
---
title: "ggplot"
output:
  html_document: default
  pdf_document: default
  word_document: default
---
```{r setup, include=FALSE}
# Set the default chunk option so that each chunk's source code is echoed
# in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# Attach the packages used throughout the document. library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets later chunks fail with a less obvious message.
library(ggplot2)
library(caTools)
library(reshape2)

# Load the diamonds example dataset into the workspace.
data("diamonds")
```
# Visualization of a single variable.
### Categorical Variable
```{r}
# Bar chart counting the rows in each level of `color`.
ggplot(data = diamonds, mapping = aes(x = color)) +
  geom_bar() +
  labs(title = "Count plot for categorical variable")
```
```{r}
# Bar chart of `color` counts, with each bar split (stacked) by `cut`.
ggplot(data = diamonds, mapping = aes(x = color)) +
  geom_bar(mapping = aes(fill = cut)) +
  labs(title = "Count plot for two categorical variables (stacked)")
```
```{r}
# Bar chart of `color` counts with one bar per `cut` level placed side by
# side (position = "dodge"). The title now says "dodged"; it previously said
# "stacked", which described the plot in the preceding chunk.
ggplot(diamonds, aes(color)) +
  geom_bar(aes(fill = cut), position = "dodge") +
  labs(title = "Count plot for two categorical variables (dodged)")
```
### Single continuous variable
```{r}
# Histogram of `carat` using 50 bins (y axis shows counts).
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(bins = 50) +
  labs(title = "Histogram of single continuous variable")
```
### Single continuous variable
```{r}
# Histogram of `carat` with the y axis on the density scale instead of counts.
# after_stat(density) replaces the deprecated `..density..` notation for
# referring to a variable computed by the stat.
ggplot(diamonds, aes(carat)) +
  geom_histogram(bins = 50, aes(y = after_stat(density))) +
  labs(title = "Density plot of single continuous variable")
```
### Single continuous variable
```{r}
# Density-scaled histogram of `carat` overlaid with a semi-transparent kernel
# density curve. after_stat(density) replaces the deprecated `..density..`
# notation, and the "desntity" typo in the title is corrected.
ggplot(diamonds, aes(carat)) +
  geom_histogram(
    bins = 50,
    color = "black",
    aes(y = after_stat(density)),
    fill = "lightgrey"
  ) +
  geom_density(fill = "red", alpha = 0.3) +
  labs(title = "Density plot plus kernel density curve")
```
```{r}
# Tag every row of `diamonds` as "training" or "testing" in a new `bucket`
# column so that later plots can compare the two groups.
set.seed(100) # fixed seed so the random split is reproducible

# sample.split() returns one logical flag per element of `price`;
# TRUE marks the rows assigned to the training bucket.
splits <- sample.split(diamonds$price)

# Single vectorised assignment instead of creating an empty column and
# filling the two subsets separately.
diamonds$bucket <- ifelse(splits, "training", "testing")

str(diamonds)
```
```{r}
# Density-scaled histograms of `carat` for the training and testing buckets,
# drawn side by side. Use position = "identity" instead of "dodge" to overlay
# the two groups. after_stat(density) replaces the deprecated `..density..`.
# The title is corrected: this is one continuous variable split into two
# groups, not two continuous variables.
ggplot(diamonds, aes(carat)) +
  geom_histogram(
    bins = 50,
    aes(y = after_stat(density), fill = bucket),
    position = "dodge"
  ) +
  labs(title = "Histogram of a continuous variable for two groups side by side")
```
```{r}
# One boxplot of `price` over all rows; the constant "" on the x axis places
# every observation in a single, unlabelled group.
ggplot(data = diamonds, mapping = aes(x = "", y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of a single continuous variable")
```
```{r}
# Boxplots of `price`, one per value of the `bucket` column.
# The title is corrected: `bucket` is a grouping (categorical) variable, so
# the plot shows one continuous variable by group, not two continuous ones.
ggplot(diamonds, aes(bucket, price)) +
  geom_boxplot() +
  labs(title = "Boxplot of a continuous variable for two groups")
```
# Association plot between 2 or more variables
```{r}
# Bar chart of the mean `price` for each `cut`. The means are computed up
# front with aggregate(), so the bars plot the values as given
# (stat = "identity") rather than counting rows.
ggplot(
  data = aggregate(price ~ cut, data = diamonds, FUN = mean),
  mapping = aes(x = cut, y = price)
) +
  geom_bar(stat = "identity") +
  labs(title = "Association between Continuous vs Categorical variables (bar chart)")
```
```{r}
# Boxplots of `price`, one per level of `cut`, to compare the distributions.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot() +
  labs(title = "Association between Continuous vs Categorical variables\nShow distribution of cont var for each categorical var")
```
```{r}
# Scatterplot of `price` against `carat` with a smoothed trend line.
# geom_smooth() is left on its default (automatic) method; per the ggplot2
# documentation that is loess only for fewer than 1,000 observations and a
# GAM otherwise.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Association between continuous vs continuous variables (scatterplot)")
```
```{r}
# Heatmap of the mean `price` for every combination of `cut` and `color`.
# Mean price is strictly positive, so a sequential white-to-black gradient is
# used. The previous diverging scale (scale_fill_gradient2) is centred on 0,
# which left its lower half unused and compressed the visible contrast.
# A title is added to match the other plots in this document.
ggplot(aggregate(price ~ cut + color, diamonds, mean), aes(cut, color)) +
  geom_tile(aes(fill = price)) +
  scale_fill_gradient(low = "white", high = "black") +
  labs(title = "Mean price by cut and color (heatmap)")
```
For numeric columns, you can generate correlations.
```{r}
# Heatmap of the pairwise correlations between the numeric columns of
# `diamonds`.

# vapply() guarantees a logical vector regardless of input, unlike sapply().
numeric_cols <- vapply(diamonds, is.numeric, logical(1))
correlations <- data.frame(cor(diamonds[, numeric_cols]))

# The correlation matrix is square with identical row and column names, so
# the column names double as row labels before reshaping to long format.
correlations$var1 <- names(correlations)
correlations <- melt(
  correlations,
  id.vars = "var1",
  variable.name = "var2",
  value.name = "correlation"
)

# Axes are named var1/var2: they hold variable names, not the diamond
# `color` and `cut` attributes the previous column names suggested.
ggplot(correlations, aes(var1, var2)) +
  geom_tile(aes(fill = correlation)) +
  scale_fill_gradient2() + # diverging scale centred on 0 suits correlations
  labs(title = "Correlation heatmap of numeric variables")
```
```{r}
# Draw the upper triangle of the correlation matrix of the numeric columns
# with corrplot, omitting the diagonal.
# library() fails loudly if corrplot is not installed; require() would only
# return FALSE and let the corrplot() call fail afterwards.
library(corrplot)

# vapply() guarantees a logical selector regardless of input, unlike sapply().
numeric_cols <- vapply(diamonds, is.numeric, logical(1))
corrplot(
  cor(diamonds[, numeric_cols]),
  type = "upper",
  tl.cex = 1.2,
  diag = FALSE
)
```