What are window functions and how do they work?
Answer
Window functions perform calculations across a set of table rows that are related to the current row, without collapsing the result set like aggregate functions do. They provide powerful analytical capabilities for ranking, running totals, and comparative analysis.
Basic Syntax
SELECT
column1,
column2,
WINDOW_FUNCTION() OVER (
[PARTITION BY column1, column2, ...]
[ORDER BY column3, column4, ...]
[ROWS/RANGE BETWEEN ... AND ...]
) AS result_column
FROM table_name;
Key Components
OVER Clause
Defines the window (set of rows) for the function to operate on.
PARTITION BY
Divides the result set into partitions (similar to GROUP BY but doesn’t collapse rows).
ORDER BY
Defines the logical order of rows within each partition.
Frame Specification
Defines which rows within the partition to include in the calculation.
Common Window Functions
1. ROW_NUMBER()
Assigns a unique sequential integer to each row.
-- Sample data
CREATE TABLE sales (
salesperson VARCHAR(50),
region VARCHAR(50),
sales_amount DECIMAL(10,2),
sale_date DATE
);
-- Assign row numbers
SELECT
salesperson,
region,
sales_amount,
ROW_NUMBER() OVER (ORDER BY sales_amount DESC) as overall_rank,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY sales_amount DESC) as region_rank
FROM sales;
2. RANK() and DENSE_RANK()
Assign ranks with different tie-handling behavior.
SELECT
salesperson,
sales_amount,
RANK() OVER (ORDER BY sales_amount DESC) as rank_with_gaps,
DENSE_RANK() OVER (ORDER BY sales_amount DESC) as dense_rank,
ROW_NUMBER() OVER (ORDER BY sales_amount DESC) as row_num
FROM sales;
-- Example output:
-- Name | Amount | RANK | DENSE_RANK | ROW_NUMBER
-- John | 1000 | 1 | 1 | 1
-- Jane | 1000 | 1 | 1 | 2 (tie)
-- Mike | 900 | 3 | 2 | 3 (RANK skips 2)
-- Sarah | 800 | 4 | 3 | 4
3. LAG() and LEAD()
Access data from previous or next rows.
SELECT
salesperson,
sale_date,
sales_amount,
LAG(sales_amount) OVER (ORDER BY sale_date) as previous_sale,
LEAD(sales_amount) OVER (ORDER BY sale_date) as next_sale,
sales_amount - LAG(sales_amount) OVER (ORDER BY sale_date) as change_from_previous
FROM sales
ORDER BY sale_date;
4. FIRST_VALUE() and LAST_VALUE()
Get the first or last value in the window.
SELECT
salesperson,
region,
sales_amount,
FIRST_VALUE(sales_amount) OVER (
PARTITION BY region
ORDER BY sales_amount DESC
) as highest_in_region,
LAST_VALUE(sales_amount) OVER (
PARTITION BY region
ORDER BY sales_amount DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as lowest_in_region
FROM sales;
Aggregate Window Functions
Running Totals
SELECT
sale_date,
sales_amount,
SUM(sales_amount) OVER (ORDER BY sale_date) as running_total,
AVG(sales_amount) OVER (ORDER BY sale_date) as running_average
FROM sales
ORDER BY sale_date;
Moving Averages
SELECT
sale_date,
sales_amount,
AVG(sales_amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as three_day_moving_avg
FROM sales
ORDER BY sale_date;
Frame Specifications
ROWS vs RANGE
-- ROWS: Physical number of rows
SELECT
sale_date,
sales_amount,
SUM(sales_amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
) as sum_3_rows
FROM sales;
-- RANGE: Logical range of values
SELECT
sale_date,
sales_amount,
SUM(sales_amount) OVER (
ORDER BY sales_amount
RANGE BETWEEN 100 PRECEDING AND 100 FOLLOWING
) as sum_similar_amounts
FROM sales;
Frame Boundaries
-- Different frame specifications
SELECT
salesperson,
sales_amount,
-- From start of partition to current row
SUM(sales_amount) OVER (
ORDER BY sales_amount
ROWS UNBOUNDED PRECEDING
) as cumulative_sum,
-- Last 3 rows including current
AVG(sales_amount) OVER (
ORDER BY sales_amount
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as last_3_avg,
-- Entire partition
COUNT(*) OVER (
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as total_count
FROM sales;
Practical Business Examples
Top N per Group
-- Top 3 salespeople per region
WITH ranked_sales AS (
SELECT
salesperson,
region,
SUM(sales_amount) as total_sales,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY SUM(sales_amount) DESC
) as rank_in_region
FROM sales
GROUP BY salesperson, region
)
SELECT salesperson, region, total_sales
FROM ranked_sales
WHERE rank_in_region <= 3;
Percentage of Total
SELECT
salesperson,
region,
sales_amount,
ROUND(
sales_amount * 100.0 / SUM(sales_amount) OVER (PARTITION BY region),
2
) as pct_of_region_total,
ROUND(
sales_amount * 100.0 / SUM(sales_amount) OVER (),
2
) as pct_of_grand_total
FROM sales;
Year-over-Year Growth
WITH monthly_sales AS (
SELECT
YEAR(sale_date) as year,
MONTH(sale_date) as month,
SUM(sales_amount) as monthly_total
FROM sales
GROUP BY YEAR(sale_date), MONTH(sale_date)
)
SELECT
year,
month,
monthly_total,
LAG(monthly_total, 12) OVER (ORDER BY year, month) as same_month_last_year,
ROUND(
(monthly_total - LAG(monthly_total, 12) OVER (ORDER BY year, month)) * 100.0 /
LAG(monthly_total, 12) OVER (ORDER BY year, month),
2
) as yoy_growth_pct
FROM monthly_sales
ORDER BY year, month;
Customer Lifetime Value Analysis
WITH customer_orders AS (
SELECT
customer_id,
order_date,
order_amount,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date) as order_sequence,
SUM(order_amount) OVER (
PARTITION BY customer_id
ORDER BY order_date
ROWS UNBOUNDED PRECEDING
) as cumulative_value
FROM orders
)
SELECT
customer_id,
order_sequence,
order_amount,
cumulative_value,
CASE
WHEN order_sequence = 1 THEN 'First Purchase'
WHEN cumulative_value >= 1000 THEN 'High Value'
WHEN cumulative_value >= 500 THEN 'Medium Value'
ELSE 'Low Value'
END as customer_segment
FROM customer_orders;
Advanced Patterns
Gaps and Islands
-- Find consecutive sales periods
WITH sales_with_groups AS (
SELECT
sale_date,
sales_amount,
sale_date - INTERVAL ROW_NUMBER() OVER (ORDER BY sale_date) DAY as group_date
FROM sales
WHERE sales_amount > 1000
)
SELECT
MIN(sale_date) as period_start,
MAX(sale_date) as period_end,
COUNT(*) as consecutive_days
FROM sales_with_groups
GROUP BY group_date
ORDER BY period_start;
Median Calculation
-- Calculate median using window functions
WITH ordered_sales AS (
SELECT
sales_amount,
ROW_NUMBER() OVER (ORDER BY sales_amount) as row_num,
COUNT(*) OVER () as total_count
FROM sales
)
SELECT
AVG(sales_amount) as median_sales
FROM ordered_sales
WHERE row_num IN (
(total_count + 1) / 2,
(total_count + 2) / 2
);
Performance Considerations
Indexing for Window Functions
-- Create appropriate indexes
CREATE INDEX idx_sales_region_amount ON sales(region, sales_amount DESC);
CREATE INDEX idx_sales_date ON sales(sale_date);
-- Efficient window function query
SELECT
salesperson,
region,
sales_amount,
RANK() OVER (PARTITION BY region ORDER BY sales_amount DESC) as rank
FROM sales;
Avoiding Repeated Calculations
-- Instead of multiple similar window functions
SELECT
salesperson,
RANK() OVER (ORDER BY sales_amount DESC) as overall_rank,
RANK() OVER (PARTITION BY region ORDER BY sales_amount DESC) as region_rank
FROM sales;
-- Use CTE to calculate once
WITH ranked_sales AS (
SELECT
salesperson,
region,
sales_amount,
RANK() OVER (ORDER BY sales_amount DESC) as overall_rank,
RANK() OVER (PARTITION BY region ORDER BY sales_amount DESC) as region_rank
FROM sales
)
SELECT * FROM ranked_sales WHERE overall_rank <= 10 OR region_rank <= 3;
Window Functions vs GROUP BY
-- GROUP BY collapses rows
SELECT region, AVG(sales_amount) as avg_sales
FROM sales
GROUP BY region;
-- Window function preserves all rows
SELECT
salesperson,
region,
sales_amount,
AVG(sales_amount) OVER (PARTITION BY region) as region_avg
FROM sales;
Interview Tips
- Understand that window functions don’t collapse rows like GROUP BY
- Know the difference between RANK(), DENSE_RANK(), and ROW_NUMBER()
- Practice using LAG() and LEAD() for time-series analysis
- Understand frame specifications (ROWS vs RANGE)
- Be familiar with common business use cases like top N per group
Test Your Knowledge
Take a quick quiz to test your understanding of this topic.