DataFrame Summary

Dataset: https://github.com/abulbasar/data/blob/master/mobile-sales-data.csv

scala> val df = spark.read.options(Map("header"-> "true", "inferSchema" -> "true")).csv("/data/mobile-sales-data.csv")
scala> df.show
+-------+----+------+---------+
|Country| Age|Salary|Purchased|
+-------+----+------+---------+
| France|  44| 72000|       No|
|  Spain|  27| 48000|      Yes|
|Germany|  30| 54000|       No|
|  Spain|  38| 61000|       No|
|Germany|  40|  null|      Yes|
| France|  35| 58000|      Yes|
|  Spain|null| 52000|       No|
| France|  48| 79000|      Yes|
|Germany|  50| 83000|       No|
| France|  37| 67000|      Yes|
+-------+----+------+---------+
scala> df.describe().show
+-------+-------+-----------------+------------------+---------+
|summary|Country|              Age|            Salary|Purchased|
+-------+-------+-----------------+------------------+---------+
|  count|     10|                9|                 9|       10|
|   mean|   null|38.77777777777778| 63777.77777777778|     null|
| stddev|   null|7.693792591722529|12265.579661982732|     null|
|    min| France|               27|             48000|       No|
|    max|  Spain|               50|             83000|      Yes|
+-------+-------+-----------------+------------------+---------+
scala> val means = df.describe().filter("summary = 'mean'")
scala> means.show
+-------+-------+-----------------+-----------------+---------+
|summary|Country|              Age|           Salary|Purchased|
+-------+-------+-----------------+-----------------+---------+
|   mean|   null|38.77777777777778|63777.77777777778|     null|
+-------+-------+-----------------+-----------------+---------+
scala> val row = means.first
scala> row.schema.fieldNames
res180: Array[String] = Array(summary, Country, Age, Salary, Purchased)
scala> val summary = row.getValuesMap[String](row.schema.fieldNames)
summary: Map[String,String] = Map(Country -> null, Purchased -> null, Age -> 38.77777777777778, Salary -> 63777.77777777778, summary -> mean)
scala> Array("Age", "Salary").map(k => (k, summary.get(k).get.toDouble)).toMap
res204: scala.collection.immutable.Map[String,Double] = Map(Age -> 38.77777777777778, Salary -> 63777.77777777778)
scala> df.na.fill(Array("Age", "Salary").map(k => (k, summary.get(k).get.toDouble)).toMap).show
+-------+---+------+---------+
|Country|Age|Salary|Purchased|
+-------+---+------+---------+
| France| 44| 72000|       No|
|  Spain| 27| 48000|      Yes|
|Germany| 30| 54000|       No|
|  Spain| 38| 61000|       No|
|Germany| 40| 63777|      Yes|
| France| 35| 58000|      Yes|
|  Spain| 38| 52000|       No|
| France| 48| 79000|      Yes|
|Germany| 50| 83000|       No|
| France| 37| 67000|      Yes|
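+-------+---+------+---------+

Note: Age and Salary were inferred as integer columns, so na.fill casts the mean values to integers before filling (38.78 becomes 38 and 63777.78 becomes 63777).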