Complex join queries using Spark
$ bin/spark-shell --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2 --conf spark.cassandra.connection.host=127.0.0.1 --verbose
val employees = spark
.read
.format("org.apache.spark.sql.cassandra")
.option("table", "employees")
.option("keyspace", "employees")
.load();
employees.registerTempTable("employees")
val salaries = spark
.read
.format("org.apache.spark.sql.cassandra")
.option("table", "salaries")
.option("keyspace", "employees")
.load();
salaries.registerTempTable("salaries")
scala> sql("select * from salaries limit 10").show
+------+----------+------+----------+
|emp_no| from_date|salary| to_date|
+------+----------+------+----------+
| 72712|2001-12-20| 69169|9999-01-01|
| 72712|2000-12-20| 65461|2001-12-20|
| 72712|1999-12-21| 62576|2000-12-20|
| 72712|1998-12-21| 61966|1999-12-21|
| 72712|1997-12-21| 61948|1998-12-21|
| 72712|1996-12-21| 60843|1997-12-21|
| 72712|1995-12-22| 59341|1996-12-21|
| 72712|1994-12-22| 59087|1995-12-22|
| 72712|1993-12-22| 56597|1994-12-22|
| 72712|1992-12-22| 55933|1993-12-22|
+------+----------+------+----------+
scala> sql("select * from employees limit 10").show
+------+----------+----------+------+----------+------------+
|emp_no|birth_date|first_name|gender| hire_date| last_name|
+------+----------+----------+------+----------+------------+
| 72712|1960-05-02| Claude| F|1988-12-23| Zedlitz|
|402236|1959-01-27| Ewing| F|1993-06-24| Copas|
|102142|1960-02-09|Yoshimitsu| F|1986-02-28| Nourani|
|484451|1964-04-12| Evgueni| F|1989-12-18| Iivonen|
|220597|1962-07-29| Godehard| M|1987-05-15| Katiyar|
| 41708|1961-03-07| Utpal| M|1985-03-21|Trachtenberg|
|439936|1962-05-21| Ishfaq| M|1987-01-12| Flowers|
| 44839|1953-07-17| Xudong| M|1986-12-08| Greibach|
| 68864|1961-08-25| Kshitij| M|1993-03-03| Ferriere|
|228740|1960-12-01| Sadun| F|1986-10-16| Tibblin|
+------+----------+----------+------+----------+------------+
sql("select year(t1.birth_date), avg(t2.salary) from employees t1 join salaries t2 on t1.emp_no = t2.emp_no group by year(t1.birth_date) limit 10").show
+----------------+------------------+
|year(birth_date)| avg(salary)|
+----------------+------------------+
| 1952| 64033.96343784632|
| 1953| 63667.26256464477|
| 1954| 63600.12015648286|
| 1955| 63926.54773794271|
| 1956|63750.155349708766|
| 1957| 63772.73717452653|
| 1958| 64033.91417091274|
| 1959| 63839.09144206498|
| 1960|63726.988640884585|
| 1961| 63797.31006067939|
| 1962| 63819.19306404951|
| 1963| 63853.39963146588|
| 1964| 63774.96963159489|
| 1965| 63313.95532309033|
+----------------+------------------+