Complex join queries using Spark

$ bin/spark-shell --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2 --conf spark.cassandra.connection.host=127.0.0.1 --verbose

val employees = spark

.read

.format("org.apache.spark.sql.cassandra")

.option("table", "employees")

.option("keyspace", "employees")

.load();

employees.registerTempTable("employees")

val salaries = spark

.read

.format("org.apache.spark.sql.cassandra")

.option("table", "salaries")

.option("keyspace", "employees")

.load();

salaries.registerTempTable("salaries")

scala> sql("select * from salaries limit 10").show

+------+----------+------+----------+

|emp_no| from_date|salary| to_date|

+------+----------+------+----------+

| 72712|2001-12-20| 69169|9999-01-01|

| 72712|2000-12-20| 65461|2001-12-20|

| 72712|1999-12-21| 62576|2000-12-20|

| 72712|1998-12-21| 61966|1999-12-21|

| 72712|1997-12-21| 61948|1998-12-21|

| 72712|1996-12-21| 60843|1997-12-21|

| 72712|1995-12-22| 59341|1996-12-21|

| 72712|1994-12-22| 59087|1995-12-22|

| 72712|1993-12-22| 56597|1994-12-22|

| 72712|1992-12-22| 55933|1993-12-22|

+------+----------+------+----------+

scala> sql("select * from employees limit 10").show

+------+----------+----------+------+----------+------------+

|emp_no|birth_date|first_name|gender| hire_date| last_name|

+------+----------+----------+------+----------+------------+

| 72712|1960-05-02| Claude| F|1988-12-23| Zedlitz|

|402236|1959-01-27| Ewing| F|1993-06-24| Copas|

|102142|1960-02-09|Yoshimitsu| F|1986-02-28| Nourani|

|484451|1964-04-12| Evgueni| F|1989-12-18| Iivonen|

|220597|1962-07-29| Godehard| M|1987-05-15| Katiyar|

| 41708|1961-03-07| Utpal| M|1985-03-21|Trachtenberg|

|439936|1962-05-21| Ishfaq| M|1987-01-12| Flowers|

| 44839|1953-07-17| Xudong| M|1986-12-08| Greibach|

| 68864|1961-08-25| Kshitij| M|1993-03-03| Ferriere|

|228740|1960-12-01| Sadun| F|1986-10-16| Tibblin|

+------+----------+----------+------+----------+------------+

sql("select year(t1.birth_date), avg(t2.salary) from employees t1 join salaries t2 on t1.emp_no = t2.emp_no group by year(t1.birth_date) limit 10").show

+----------------+------------------+

|year(birth_date)| avg(salary)|

+----------------+------------------+

| 1952| 64033.96343784632|

| 1953| 63667.26256464477|

| 1954| 63600.12015648286|

| 1955| 63926.54773794271|

| 1956|63750.155349708766|

| 1957| 63772.73717452653|

| 1958| 64033.91417091274|

| 1959| 63839.09144206498|

| 1960|63726.988640884585|

| 1961| 63797.31006067939|

| 1962| 63819.19306404951|

| 1963| 63853.39963146588|

| 1964| 63774.96963159489|

| 1965| 63313.95532309033|

+----------------+------------------+