Hive - Window Functions

Create stocks table in hive. You can download the data from this link.

hive> create external table stocks(

tr_date String,

open Double,

high Double,

low Double,

close Double,

volume Double,

adjClose Double,

symbol String)

row format delimited

fields terminated by ','

stored as textfile

location '/user/cloudera/stocks'

TBLPROPERTIES("skip.header.line.count"="1");

Here is another way to define the table using hive serde

hive> CREATE EXTERNAL TABLE stocks(

tr_date String,

open Double,

high Double,

low Double,

close Double,

volume Double,

adjClose Double,

symbol String)

ROW FORMAT SERDE

'org.apache.hadoop.hive.serde2.OpenCSVSerde'

WITH SERDEPROPERTIES (

'separatorChar' = ',',

'quoteChar' = '"',

'escapeChar' = '\\')

STORED AS INPUTFORMAT

'org.apache.hadoop.mapred.TextInputFormat'

OUTPUTFORMAT

'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

LOCATION

'/user/cloudera/stocks'

TBLPROPERTIES (

'serialization.null.format' = '',

'skip.header.line.count' = '1');

Describe the table schema

hive> describe formatted stocks;

Randomly select 10 rows.

hive> select * from stocks order by rand() limit 10;

+--------------+--------------+--------------+-------------+---------------+----------------+------------------+----------------+--+

| stocks.date | stocks.open | stocks.high | stocks.low | stocks.close | stocks.volume | stocks.adjclose | stocks.symbol |

+--------------+--------------+--------------+-------------+---------------+----------------+------------------+----------------+--+

| 2013-01-25 | 18.99 | 19.02 | 18.49 | 18.620001 | 3184300.0 | 17.681588 | SEE |

| 2010-04-05 | 24.73 | 25.5 | 24.66 | 25.389999 | 1.42255E7 | 21.731701 | TXN |

| 2011-04-04 | 33.580002 | 34.07 | 33.580002 | 33.970001 | 471600.0 | 16.985001 | CNC |

| 2014-03-27 | 18.59 | 18.91 | 18.389999 | 18.68 | 6906300.0 | 18.025039 | PHM |

| 2000-08-04 | 86.5 | 86.5 | 83.5 | 85.937523 | 3631600.0 | 21.484381 | EA |

| 2014-12-24 | 41.419998 | 41.529999 | 40.959999 | 41.130001 | 1426900.0 | 40.373814 | A |

| 2011-05-26 | 23.9 | 24.35 | 23.58 | 24.23 | 4974400.0 | 21.041444 | IVZ |

| 2005-05-04 | 11.65 | 11.87 | 11.63 | 11.72 | 3.76341E7 | 10.804011 | ORCL |

| 2009-12-29 | 20.049999 | 20.049999 | 19.75 | 19.76 | 657600.0 | 9.88 | LKQ |

| 2003-02-07 | 25.509996 | 25.740006 | 25.010005 | 25.109995 | 620200.0 | 15.840485 | DOV |

+--------------+--------------+--------------+-------------+---------------+----------------+------------------+----------------+--+

Find top 1 stocks each year according to transaction volume.

hive> select t.* from (select stocks.*, row_number() over (partition by year(tr_date) order by volume desc) row_num from stocks) t where row_num = 1;

+-------------+-------------+------------+-------------+-------------+--------------+-------------+-----------+------------+--+

| t.date | t.open | t.high | t.low | t.close | t.volume | t.adjclose | t.symbol | t.row_num |

+-------------+-------------+------------+-------------+-------------+--------------+-------------+-----------+------------+--+

| 2000-09-29 | 28.187501 | 29.0 | 25.375 | 25.75 | 1.8554102E9 | 1.683913 | AAPL | 1 |

| 2001-04-19 | 25.550001 | 25.75 | 23.599999 | 25.72 | 4.684176E8 | 1.681951 | AAPL | 1 |

| 2002-07-19 | 58.199998 | 59.949998 | 56.659999 | 56.919997 | 5.01891E8 | 5.989057 | EBAY | 1 |

| 2003-05-05 | 14.769999 | 16.880001 | 14.75 | 16.089999 | 3.88927E8 | 1.0522 | AAPL | 1 |

| 2004-10-14 | 43.190001 | 45.749999 | 42.549999 | 44.980001 | 6.921068E8 | 2.941452 | AAPL | 1 |

| 2005-01-13 | 73.709998 | 74.420001 | 69.73 | 69.800002 | 7.911792E8 | 4.564548 | AAPL | 1 |

| 2006-04-06 | 68.300001 | 72.05 | 68.199999 | 71.240002 | 6.659422E8 | 9.317433 | AAPL | 1 |

| 2007-01-09 | 86.450003 | 92.979999 | 85.15 | 92.570003 | 8.373246E8 | 12.107169 | AAPL | 1 |

| 2008-01-23 | 136.190006 | 140.0 | 126.140003 | 139.070005 | 8.432424E8 | 18.188874 | AAPL | 1 |

| 2009-12-04 | 15.76 | 16.4 | 15.62 | 16.280001 | 1.2267913E9 | 15.608195 | BAC | 1 |

| 2010-10-20 | 11.6 | 11.88 | 11.17 | 11.75 | 6.551166E8 | 11.288341 | BAC | 1 |

| 2011-08-25 | 8.29 | 8.8 | 7.38 | 7.65 | 8.596434E8 | 7.367703 | BAC | 1 |

| 2012-03-19 | 9.78 | 10.1 | 9.51 | 9.53 | 6.694799E8 | 9.219125 | BAC | 1 |

| 2013-07-25 | 33.540001 | 34.880001 | 32.75 | 34.360001 | 3.654579E8 | 34.360001 | FB | 1 |

| 2014-02-24 | 47.02 | 47.200001 | 46.23 | 46.23 | 6.166205E8 | 41.278369 | VZ | 1 |

| 2015-11-17 | 30.57 | 30.75 | 30.0 | 30.32 | 4.313326E8 | 29.631406 | GE | 1 |

| 2016-02-11 | 11.46 | 11.55 | 10.99 | 11.16 | 3.750887E8 | 11.08011 | BAC | 1 |

+-------------+-------------+------------+-------------+-------------+--------------+-------------+-----------+------------+--+

Note: Check out the difference between row_number, rank, dense_rank and ntile in this blog.

Display a new column that contains the average volume of a stock for each symbol calculated over the corresponding year.

hive> select tr_date, symbol, volume, avg(volume) over (partition by symbol, year(tr_date)) avg_vol_by_symbol_year from stocks limit 10;

+-------------+---------+------------+-------------------------+--+

| date | symbol | volume | avg_vol_by_symbol_year |

+-------------+---------+------------+-------------------------+--+

| 2000-10-25 | A | 5268500.0 | 4730647.863247863 |

| 2000-09-12 | A | 2830300.0 | 4730647.863247863 |

| 2000-12-07 | A | 3059900.0 | 4730647.863247863 |

| 2000-09-29 | A | 3677400.0 | 4730647.863247863 |

| 2000-12-06 | A | 4366900.0 | 4730647.863247863 |

| 2000-11-29 | A | 4385100.0 | 4730647.863247863 |

| 2000-09-07 | A | 2123000.0 | 4730647.863247863 |

| 2000-10-05 | A | 3998800.0 | 4730647.863247863 |

| 2000-08-07 | A | 5560800.0 | 4730647.863247863 |

| 2000-07-26 | A | 1.13853E7 | 4730647.863247863 |

+-------------+---------+------------+-------------------------+--+

Find first value, last value, max and mean for each stock symbol over each year after 2013.

hive> FROM (SELECT symbol, year(tr_date) year,

First_value(adjclose) OVER (partition BY symbol, Year(tr_date)) first_close,

Last_value(adjclose) OVER (partition BY symbol, Year(tr_date)) last_close,

Min(adjclose) OVER (partition BY symbol, Year(tr_date)) min_close,

Max(adjclose) OVER (partition BY symbol, Year(tr_date)) max_close,

row_number() OVER (partition BY symbol, Year(tr_date)) row_num

FROM stocks) t

SELECT t.*

WHERE row_num = 1 and year > 2013

LIMIT 10;

+-----------+---------+----------------+---------------+--------------+--------------+------------+--+

| t.symbol | t.year | t.first_close | t.last_close | t.min_close | t.max_close | t.row_num |

+-----------+---------+----------------+---------------+--------------+--------------+------------+--+

| A | 2014 | 36.70172 | 41.954214 | 36.308503 | 42.482843 | 1 |

| A | 2015 | 41.446851 | 40.986663 | 33.095757 | 42.959547 | 1 |

| A | 2016 | 41.20144 | 46.950001 | 34.608383 | 48.439999 | 1 |

| AA | 2014 | 11.074719 | 12.146152 | 9.811926 | 17.244273 | 1 |

| AA | 2015 | 8.978608 | 12.465825 | 7.741204 | 16.755247 | 1 |

| AA | 2016 | 9.302878 | 10.290001 | 6.672086 | 11.244435 | 1 |

| AAL | 2014 | 34.388208 | 52.5113 | 24.803327 | 52.717731 | 1 |

| AAL | 2015 | 42.243244 | 44.109016 | 37.098867 | 54.922539 | 1 |

| AAL | 2016 | 40.680279 | 35.922479 | 25.194591 | 43.215343 | 1 |

| AAP | 2014 | 124.49275 | 135.906821 | 109.294673 | 160.851334 | 1 |

+-----------+---------+----------------+---------------+--------------+--------------+------------+--+

Display previous day adjclose and next day adjusted close for each day of stock in 2016 for each symbol.

hive> select symbol, tr_date, adjclose,

lag(adjclose, 1) over (partition by symbol order by tr_date asc) prev_day_close,

lead(adjclose, 1) over (partition by symbol order by tr_date asc) next_day_close

from stocks where year(tr_date) = 2016 limit 10;

+---------+-------------+------------+-----------------+-----------------+--+

| symbol | date | adjclose | prev_day_close | next_day_close |

+---------+-------------+------------+-----------------+-----------------+--+

| A | 2016-01-04 | 40.465951 | NULL | 40.326722 |

| A | 2016-01-05 | 40.326722 | 40.465951 | 40.505731 |

| A | 2016-01-06 | 40.505731 | 40.326722 | 38.785258 |

| A | 2016-01-07 | 38.785258 | 40.505731 | 38.377515 |

| A | 2016-01-08 | 38.377515 | 38.785258 | 37.731093 |

| A | 2016-01-11 | 37.731093 | 38.377515 | 37.979716 |

| A | 2016-01-12 | 37.979716 | 37.731093 | 36.657041 |

| A | 2016-01-13 | 36.657041 | 37.979716 | 37.402912 |

| A | 2016-01-14 | 37.402912 | 36.657041 | 36.905665 |

| A | 2016-01-15 | 36.905665 | 37.402912 | 36.995169 |

+---------+-------------+------------+-----------------+-----------------+--+

Find stocks that have given the highest return in 52 week window as of 1st of Jul 2016

hive> select

*,

(adjclose - adjclose52) / adjclose as change

from

(select

symbol,

tr_date,

adjclose,

lag(tr_date, 250) over (partition by symbol order by tr_date) date52,

lag(adjclose, 250) over (partition by symbol order by tr_date) adjclose52

from stocks) t

where tr_date = '2016-07-01'

order by change desc;

+------+----------+----------+----------+----------+-------------------+

|symbol| date| adjclose| date52|adjclose52| change|

+------+----------+----------+----------+----------+-------------------+

| NVDA|2016-07-01| 46.66|2015-07-07| 19.510602| 0.581855936562366|

| NEM|2016-07-01| 40.34|2015-07-07| 22.308644|0.44698453148239964|

| AWK|2016-07-01| 83.71146|2015-07-07| 49.70205|0.40626946418088994|

| AMZN|2016-07-01|725.679993|2015-07-07|436.720001|0.39819203338571296|

| DLR|2016-07-01| 109.0|2015-07-07| 65.990661| 0.3945810917431192|

Reference:

  • https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics