ggplot

---

title: "ggplot"

output:
  html_document: default
  pdf_document: default
  word_document: default

---

```{r setup, include=FALSE}

# Set the default chunk option so every chunk's source is echoed in the output.
knitr::opts_chunk$set(echo = TRUE)

```

```{r}

# Attach dependencies with library() so that a missing package stops the knit
# immediately; require() only warns and returns FALSE, which defers the failure
# to the first call that needs the package.
library(ggplot2)  # plotting
library(caTools)  # sample.split()
library(reshape2) # melt()

# Load the `diamonds` example data set into the workspace.
data("diamonds")

```

# Visualization of a single variable.

### Categorical Variable

```{r}

# Bar chart of the number of rows at each level of `color`.
ggplot(data = diamonds, mapping = aes(x = color)) +
  geom_bar() +
  labs(title = "Count plot for categorical variable")

```

```{r}

# Counts per `color` level; each bar is filled (and stacked) by `cut`.
ggplot(data = diamonds, mapping = aes(x = color)) +
  geom_bar(mapping = aes(fill = cut)) +
  labs(title = "Count plot for two categorical variables (stacked)")

```

```{r}

# Counts per `color` level with one bar per `cut`, drawn side by side.
# position = "dodge" places the bars next to each other instead of stacking
# them, so the title describes the plot as dodged.
ggplot(diamonds, aes(color)) +
  geom_bar(aes(fill = cut), position = "dodge") +
  labs(title = "Count plot for two categorical variables (dodged)")

```

### Single continuous variable

```{r}

# Histogram of `carat` using 50 bins.
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(bins = 50) +
  labs(title = "Histogram of single continuous variable")

```

### Single continuous variable (density scale)

```{r}

# Histogram of `carat` with the y axis on the density scale instead of counts.
# after_stat(density) replaces the deprecated `..density..` notation for
# referring to a variable computed by the stat.
ggplot(diamonds, aes(carat)) +
  geom_histogram(bins = 50, aes(y = after_stat(density))) +
  labs(title = "Density plot of single continuous variable")

```

### Single continuous variable (histogram with kernel density curve)

```{r}

# Density-scaled histogram of `carat` with a kernel density curve drawn on top.
# The histogram must use the density scale so both layers share one y axis;
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(diamonds, aes(carat)) +
  geom_histogram(
    bins = 50,
    color = "black",
    aes(y = after_stat(density)),
    fill = "lightgrey"
  ) +
  # Semi-transparent fill keeps the histogram bars visible underneath.
  geom_density(fill = "red", alpha = 0.3) +
  labs(title = "Density plot plus kernel density curve")

```

```{r}

# Randomly label every row of `diamonds` as "training" or "testing".
set.seed(100) # fixed seed so the split is the same on every knit

# sample.split() yields a logical vector with one entry per element of `price`;
# TRUE rows go to the training bucket, FALSE rows to the testing bucket.
splits <- sample.split(diamonds$price)

# Assign both labels in one vectorised step instead of pre-filling the column
# with "" and overwriting it through two row-subset assignments.
diamonds$bucket <- ifelse(splits, "training", "testing")

# Show the resulting structure, including the new `bucket` column.
str(diamonds)

```

```{r}

# Density-scaled histograms of `carat`, one per `bucket`, drawn side by side.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(diamonds, aes(carat)) +
  geom_histogram(
    bins = 50,
    aes(y = after_stat(density), fill = bucket),
    position = "dodge" # position: dodge, identity
  ) +
  # One continuous variable is shown, split by bucket, so the title says so.
  labs(title = "Histogram of a continuous variable by bucket, side by side")

```

```{r}

# Single boxplot of `price`; the constant "" on x puts every row in one group.
ggplot(data = diamonds, mapping = aes(x = "", y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of a single continuous variable")

```

```{r}

# One boxplot of `price` per value of `bucket`.
# The plot shows a single continuous variable grouped by a label, not two
# continuous variables, so the title states that.
ggplot(diamonds, aes(bucket, price)) +
  geom_boxplot() +
  labs(title = "Boxplot of a continuous variable for each bucket")

```

# Association plot between 2 or more variables

```{r}

# Bar chart of the mean `price` for each `cut`; the means are computed first
# with aggregate(), and the bar heights are taken directly from those values.
ggplot(
  data = aggregate(price ~ cut, data = diamonds, FUN = mean),
  mapping = aes(x = cut, y = price)
) +
  geom_col() +
  labs(title = "Association between Continuous vs Categorical variables (bar chart)")

```

```{r}

# One boxplot of `price` per `cut`, showing the distribution within each level.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot() +
  labs(
    title = "Association between Continuous vs Categorical variables\nShow distribution of cont var for each categorical var"
  )

```

```{r}

# Scatterplot of price against carat with a smoothed trend line on top.
ggplot(diamonds, aes(carat, price)) +

geom_point() +

# No `method` is given, so ggplot2 picks one by group size: loess below 1,000
# observations, mgcv::gam() otherwise. NOTE(review): `diamonds` has roughly
# 54,000 rows, so this is presumably a GAM fit rather than loess -- confirm.
geom_smooth() +

labs(title = "Association between continuous vs continuous variables (scatterplot)")

```

```{r}

# Heatmap of the mean `price` for every cut/color combination.
ggplot(
  data = aggregate(price ~ cut + color, data = diamonds, FUN = mean),
  mapping = aes(x = cut, y = color)
) +
  geom_tile(mapping = aes(fill = price)) +
  # Diverging fill scale; only the `low` and `high` colours are overridden.
  scale_fill_gradient2(low = "white", high = "black")

```

For numeric columns, you can generate correlations.

```{r}

# Heatmap of the pairwise correlations between the numeric columns.

# vapply() always returns a logical vector here, whereas sapply()'s return
# type depends on its input.
numeric_cols <- vapply(diamonds, is.numeric, logical(1))

# Correlation matrix as a data frame, with the row variable kept in `col` so it
# survives the reshape to long format (one row per pair of variables).
correlations <- data.frame(cor(diamonds[, numeric_cols]))
correlations$col <- names(correlations)
correlations <- melt(correlations, id.vars = "col")

# Both key columns hold variable names, so give them neutral names rather than
# reusing "color" and "cut", which would mislabel the axes.
colnames(correlations) <- c("var_x", "var_y", "correlation")

ggplot(correlations, aes(var_x, var_y)) +
  geom_tile(aes(fill = correlation)) +
  scale_fill_gradient2() +
  # The tick labels already name the variables; axis titles add nothing.
  labs(x = NULL, y = NULL)

```

```{r}

# Upper-triangle correlation plot of the numeric columns, without the diagonal.

# library() fails immediately if corrplot is not installed; require() would
# only warn and let the corrplot() call below fail instead.
library(corrplot)

# vapply() always returns a logical vector here, whereas sapply()'s return
# type depends on its input.
numeric_cols <- vapply(diamonds, is.numeric, logical(1))

corrplot(
  cor(diamonds[, numeric_cols]),
  type = "upper",
  tl.cex = 1.2,
  diag = FALSE
)

```