What are Common Table Expressions (CTEs)?
Answer
Common Table Expressions (CTEs) are temporary named result sets that exist only within the scope of a single SQL statement. They provide a way to write more readable and maintainable complex queries.
Basic Syntax
WITH cte_name (column1, column2, ...) AS (
-- CTE query definition
SELECT column1, column2, ...
FROM table_name
WHERE condition
)
-- Main query using the CTE
SELECT * FROM cte_name WHERE condition;
Simple CTE Example
-- Without CTE (subquery in FROM clause)
SELECT dept_name, avg_salary
FROM (
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id
) emp_avg
JOIN departments d ON emp_avg.department_id = d.department_id
WHERE avg_salary > 50000;
-- With CTE (more readable)
WITH department_averages AS (
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id
)
SELECT d.dept_name, da.avg_salary
FROM department_averages da
JOIN departments d ON da.department_id = d.department_id
WHERE da.avg_salary > 50000;
Multiple CTEs
WITH
high_earners AS (
SELECT employee_id, name, salary, department_id
FROM employees
WHERE salary > 75000
),
department_stats AS (
SELECT
department_id,
COUNT(*) as total_employees,
AVG(salary) as avg_salary
FROM employees
GROUP BY department_id
)
SELECT
he.name,
he.salary,
ds.avg_salary,
ds.total_employees
FROM high_earners he
JOIN department_stats ds ON he.department_id = ds.department_id;
Recursive CTEs
Purpose: Handle hierarchical or tree-structured data.
Employee Hierarchy Example
-- Sample hierarchical data
CREATE TABLE employees (
employee_id INT PRIMARY KEY,
name VARCHAR(100),
manager_id INT,
salary DECIMAL(10,2)
);
-- Recursive CTE to find all subordinates
WITH RECURSIVE employee_hierarchy AS (
-- Anchor: Start with top-level managers
SELECT
employee_id,
name,
manager_id,
salary,
0 as level,
CAST(name AS VARCHAR(1000)) as hierarchy_path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive: Find direct reports
SELECT
e.employee_id,
e.name,
e.manager_id,
e.salary,
eh.level + 1,
CONCAT(eh.hierarchy_path, ' -> ', e.name)
FROM employees e
JOIN employee_hierarchy eh ON e.manager_id = eh.employee_id
)
SELECT
employee_id,
name,
level,
hierarchy_path
FROM employee_hierarchy
ORDER BY level, name;
Organizational Chart
WITH RECURSIVE org_chart AS (
-- Find CEO (no manager)
SELECT
employee_id,
name,
manager_id,
1 as level,
name as path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Find all subordinates recursively
SELECT
e.employee_id,
e.name,
e.manager_id,
oc.level + 1,
CONCAT(oc.path, ' > ', e.name)
FROM employees e
JOIN org_chart oc ON e.manager_id = oc.employee_id
WHERE oc.level < 10 -- Prevent infinite recursion
)
SELECT * FROM org_chart ORDER BY level, name;
Practical Business Examples
Sales Analysis
WITH
monthly_sales AS (
SELECT
DATE_FORMAT(order_date, '%Y-%m') as month,
SUM(total_amount) as monthly_total
FROM orders
WHERE order_date >= '2024-01-01'
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
),
sales_with_growth AS (
SELECT
month,
monthly_total,
LAG(monthly_total) OVER (ORDER BY month) as prev_month_total,
monthly_total - LAG(monthly_total) OVER (ORDER BY month) as growth
FROM monthly_sales
)
SELECT
month,
monthly_total,
prev_month_total,
growth,
ROUND((growth / prev_month_total) * 100, 2) as growth_percentage
FROM sales_with_growth
WHERE prev_month_total IS NOT NULL
ORDER BY month;
Customer Segmentation
WITH
customer_metrics AS (
SELECT
c.customer_id,
c.customer_name,
COUNT(o.order_id) as order_count,
SUM(o.total_amount) as total_spent,
AVG(o.total_amount) as avg_order_value,
MAX(o.order_date) as last_order_date
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name
),
customer_segments AS (
SELECT
*,
CASE
WHEN total_spent > 10000 AND order_count > 20 THEN 'VIP'
WHEN total_spent > 5000 AND order_count > 10 THEN 'Premium'
WHEN total_spent > 1000 AND order_count > 5 THEN 'Regular'
WHEN order_count > 0 THEN 'Occasional'
ELSE 'Inactive'
END as segment
FROM customer_metrics
)
SELECT
segment,
COUNT(*) as customer_count,
AVG(total_spent) as avg_total_spent,
AVG(order_count) as avg_order_count
FROM customer_segments
GROUP BY segment
ORDER BY avg_total_spent DESC;
CTE vs Subqueries vs Temporary Tables
CTE Advantages:
- Readability: More readable than nested subqueries
- Reusability: Can reference the same CTE multiple times
- Recursion: Supports recursive operations
- Maintainability: Easier to debug and modify
CTE vs Subquery
-- Complex subquery (hard to read)
SELECT *
FROM (
SELECT department_id, AVG(salary) as avg_sal
FROM employees
GROUP BY department_id
) dept_avg
WHERE dept_avg.avg_sal > (
SELECT AVG(salary) * 1.1
FROM employees
);
-- CTE version (more readable)
WITH
company_avg AS (
SELECT AVG(salary) * 1.1 as threshold
FROM employees
),
dept_averages AS (
SELECT department_id, AVG(salary) as avg_sal
FROM employees
GROUP BY department_id
)
SELECT *
FROM dept_averages da
CROSS JOIN company_avg ca
WHERE da.avg_sal > ca.threshold;
CTE vs Temporary Table
-- Temporary table approach
CREATE TEMPORARY TABLE temp_dept_stats AS
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id;
SELECT * FROM temp_dept_stats WHERE avg_salary > 50000;
DROP TEMPORARY TABLE temp_dept_stats;
-- CTE approach (simpler)
WITH dept_stats AS (
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id
)
SELECT * FROM dept_stats WHERE avg_salary > 50000;
Performance Considerations
CTE Materialization
-- CTE may be materialized (stored temporarily)
WITH expensive_cte AS (
SELECT customer_id, SUM(order_amount) as total
FROM large_orders_table
GROUP BY customer_id
)
SELECT c1.customer_id, c1.total
FROM expensive_cte c1
JOIN expensive_cte c2 ON c1.total = c2.total -- CTE used twice
WHERE c1.customer_id != c2.customer_id;
Optimization Tips
-- Use indexes on CTE source tables
WITH recent_orders AS (
SELECT customer_id, order_date, total_amount
FROM orders
WHERE order_date >= '2024-01-01' -- Make sure there's an index on order_date
)
SELECT * FROM recent_orders WHERE customer_id = 12345;
Advanced CTE Patterns
Running Totals
WITH daily_sales AS (
SELECT
order_date,
SUM(total_amount) as daily_total
FROM orders
GROUP BY order_date
),
running_totals AS (
SELECT
order_date,
daily_total,
SUM(daily_total) OVER (ORDER BY order_date) as running_total
FROM daily_sales
)
SELECT * FROM running_totals ORDER BY order_date;
Data Validation
WITH data_quality_checks AS (
SELECT
'Missing emails' as check_name,
COUNT(*) as issue_count
FROM customers
WHERE email IS NULL OR email = ''
UNION ALL
SELECT
'Negative prices' as check_name,
COUNT(*) as issue_count
FROM products
WHERE price < 0
UNION ALL
SELECT
'Future order dates' as check_name,
COUNT(*) as issue_count
FROM orders
WHERE order_date > CURRENT_DATE
)
SELECT * FROM data_quality_checks WHERE issue_count > 0;
Interview Tips
- Understand that CTEs improve query readability and maintainability
- Know the difference between regular and recursive CTEs
- Practice writing hierarchical queries with recursive CTEs
- Understand when to use CTEs vs subqueries vs temporary tables
- Be familiar with CTE performance characteristics and limitations
Test Your Knowledge
Take a quick quiz to test your understanding of this topic.