DataFrame Summary: filling missing values with column means

Dataset: https://github.com/abulbasar/data/blob/master/mobile-sales-data.csv

scala> val df = spark.read.options(Map("header"-> "true", "inferSchema" -> "true")).csv("/data/mobile-sales-data.csv")

scala> df.show

+-------+----+------+---------+

|Country| Age|Salary|Purchased|

+-------+----+------+---------+

| France| 44| 72000| No|

| Spain| 27| 48000| Yes|

|Germany| 30| 54000| No|

| Spain| 38| 61000| No|

|Germany| 40| null| Yes|

| France| 35| 58000| Yes|

| Spain|null| 52000| No|

| France| 48| 79000| Yes|

|Germany| 50| 83000| No|

| France| 37| 67000| Yes|

+-------+----+------+---------+

scala> df.describe().show

+-------+-------+-----------------+------------------+---------+

|summary|Country| Age| Salary|Purchased|

+-------+-------+-----------------+------------------+---------+

| count| 10| 9| 9| 10|

| mean| null|38.77777777777778| 63777.77777777778| null|

| stddev| null|7.693792591722529|12265.579661982732| null|

| min| France| 27| 48000| No|

| max| Spain| 50| 83000| Yes|

+-------+-------+-----------------+------------------+---------+

scala> val means = df.describe().filter("summary = 'mean'")

scala> means.show

+-------+-------+-----------------+-----------------+---------+

|summary|Country| Age| Salary|Purchased|

+-------+-------+-----------------+-----------------+---------+

| mean| null|38.77777777777778|63777.77777777778| null|

+-------+-------+-----------------+-----------------+---------+

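scala> val row = means.first

row: org.apache.spark.sql.Row = [mean,null,38.77777777777778,63777.77777777778,null]

scala> row.schema.fieldNames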

res180: Array[String] = Array(summary, Country, Age, Salary, Purchased)

scala> val summary = row.getValuesMap[String](row.schema.fieldNames)

summary: Map[String,String] = Map(Country -> null, Purchased -> null, Age -> 38.77777777777778, Salary -> 63777.77777777778, summary -> mean)

scala> Array("Age", "Salary").map(k => (k, summary.get(k).get.toDouble)).toMap

res204: scala.collection.immutable.Map[String,Double] = Map(Age -> 38.77777777777778, Salary -> 63777.77777777778)

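Note: Age and Salary were inferred as integer columns, so na.fill casts the replacement means to integers (38.78 becomes 38, 63777.78 becomes 63777).

scala> df.na.fill(Array("Age", "Salary").map(k => (k, summary.get(k).get.toDouble)).toMap).show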

+-------+---+------+---------+

|Country|Age|Salary|Purchased|

+-------+---+------+---------+

| France| 44| 72000| No|

| Spain| 27| 48000| Yes|

|Germany| 30| 54000| No|

| Spain| 38| 61000| No|

|Germany| 40| 63777| Yes|

| France| 35| 58000| Yes|

| Spain| 38| 52000| No|

| France| 48| 79000| Yes|

|Germany| 50| 83000| No|

| France| 37| 67000| Yes|

+-------+---+------+---------+