What are subqueries and how do they work?

Answer

A subquery (also called an inner query or nested query) is a query embedded within another SQL statement. The subquery executes first, and its result is used by the outer query.

Basic Syntax

SELECT column1, column2
FROM table1
WHERE column1 = (SELECT column1 FROM table2 WHERE condition);

Types of Subqueries

1. Scalar Subquery

Returns a single value (one row, one column).

-- Find employees with salary higher than average
SELECT name, salary
FROM employees
WHERE salary > (SELECT AVG(salary) FROM employees);

-- Get employee with highest salary in each department
SELECT name, department, salary
FROM employees e1
WHERE salary = (
    SELECT MAX(salary) 
    FROM employees e2 
    WHERE e2.department = e1.department
);

2. Row Subquery

Returns a single row with multiple columns.

-- Find employee with same department and salary as John Doe
SELECT name, department, salary
FROM employees
WHERE (department, salary) = (
    SELECT department, salary 
    FROM employees 
    WHERE name = 'John Doe'
);

3. Table Subquery

Returns multiple rows and columns.

-- Find employees in departments with more than 5 employees
SELECT name, department
FROM employees
WHERE department IN (
    SELECT department
    FROM employees
    GROUP BY department
    HAVING COUNT(*) > 5
);

Subquery Locations

1. WHERE Clause

-- Employees earning more than department average
SELECT name, salary, department
FROM employees e1
WHERE salary > (
    SELECT AVG(salary)
    FROM employees e2
    WHERE e2.department = e1.department
);

2. FROM Clause (Derived Table)

-- Department statistics
SELECT dept_stats.department, dept_stats.avg_salary
FROM (
    SELECT department, AVG(salary) as avg_salary
    FROM employees
    GROUP BY department
) AS dept_stats
WHERE dept_stats.avg_salary > 50000;

3. SELECT Clause

-- Employee details with department average
SELECT 
    name,
    salary,
    department,
    (SELECT AVG(salary) FROM employees e2 WHERE e2.department = e1.department) as dept_avg
FROM employees e1;

4. HAVING Clause

-- Departments with average salary higher than company average
SELECT department, AVG(salary) as dept_avg
FROM employees
GROUP BY department
HAVING AVG(salary) > (SELECT AVG(salary) FROM employees);

Common Subquery Operators

EXISTS / NOT EXISTS

-- Customers who have placed orders
SELECT customer_name
FROM customers c
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = c.customer_id
);

-- Customers who haven't placed orders
SELECT customer_name
FROM customers c
WHERE NOT EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = c.customer_id
);

IN / NOT IN

-- Products in specific categories
SELECT product_name, price
FROM products
WHERE category_id IN (
    SELECT category_id 
    FROM categories 
    WHERE category_name IN ('Electronics', 'Books')
);

-- Handle NULL values carefully with NOT IN
SELECT product_name
FROM products
WHERE category_id NOT IN (
    SELECT category_id 
    FROM categories 
    WHERE category_id IS NOT NULL  -- Important!
);

ANY / SOME

-- Products more expensive than ANY book
SELECT product_name, price
FROM products
WHERE price > ANY (
    SELECT price 
    FROM products 
    WHERE category = 'Books'
);

ALL

-- Products more expensive than ALL books
SELECT product_name, price
FROM products
WHERE price > ALL (
    SELECT price 
    FROM products 
    WHERE category = 'Books'
);

Practical Examples

E-commerce Scenarios

-- Sample tables
CREATE TABLE customers (
    customer_id INT PRIMARY KEY,
    customer_name VARCHAR(100),
    city VARCHAR(50)
);

CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT,
    order_date DATE,
    total_amount DECIMAL(10,2)
);

CREATE TABLE products (
    product_id INT PRIMARY KEY,
    product_name VARCHAR(100),
    category VARCHAR(50),
    price DECIMAL(10,2)
);

Complex Business Queries

-- 1. Find customers who spent more than average in the last year
SELECT customer_name
FROM customers c
WHERE (
    SELECT SUM(total_amount)
    FROM orders o
    WHERE o.customer_id = c.customer_id
    AND o.order_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
) > (
    SELECT AVG(yearly_total)
    FROM (
        SELECT SUM(total_amount) as yearly_total
        FROM orders
        WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
        GROUP BY customer_id
    ) as customer_totals
);

-- 2. Products never ordered
SELECT product_name
FROM products p
WHERE NOT EXISTS (
    SELECT 1 
    FROM order_items oi 
    WHERE oi.product_id = p.product_id
);

-- 3. Top 3 customers by order value in each city
SELECT customer_name, city, total_orders
FROM (
    SELECT 
        c.customer_name,
        c.city,
        SUM(o.total_amount) as total_orders,
        ROW_NUMBER() OVER (PARTITION BY c.city ORDER BY SUM(o.total_amount) DESC) as rn
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.customer_name, c.city
) ranked
WHERE rn <= 3;

Performance Considerations

Correlated vs Non-Correlated Subqueries

-- Non-correlated (executes once)
SELECT name FROM employees
WHERE department_id IN (SELECT id FROM departments WHERE location = 'NYC');

-- Correlated (executes for each outer row)
SELECT name FROM employees e1
WHERE salary > (SELECT AVG(salary) FROM employees e2 WHERE e2.dept_id = e1.dept_id);

Optimization Tips

-- Instead of correlated subquery
SELECT e1.name, e1.salary
FROM employees e1
WHERE e1.salary > (
    SELECT AVG(e2.salary)
    FROM employees e2
    WHERE e2.department = e1.department
);

-- Use window function (often faster)
SELECT name, salary
FROM (
    SELECT 
        name, 
        salary, 
        department,
        AVG(salary) OVER (PARTITION BY department) as dept_avg
    FROM employees
) t
WHERE salary > dept_avg;

Common Pitfalls

1. NULL Handling with NOT IN

-- Dangerous: Returns no results if subquery contains NULL
SELECT * FROM products
WHERE category_id NOT IN (SELECT category_id FROM categories);

-- Safe: Filter out NULLs
SELECT * FROM products
WHERE category_id NOT IN (
    SELECT category_id FROM categories WHERE category_id IS NOT NULL
);

2. Multiple Row Subquery with Scalar Operator

-- Error: Subquery returns multiple rows
SELECT * FROM employees
WHERE salary = (SELECT salary FROM employees WHERE department = 'IT');

-- Correct: Use IN or ANY
SELECT * FROM employees
WHERE salary IN (SELECT salary FROM employees WHERE department = 'IT');

Subquery vs JOIN Performance

-- Subquery approach
SELECT customer_name
FROM customers
WHERE customer_id IN (SELECT customer_id FROM orders);

-- JOIN approach (often faster)
SELECT DISTINCT c.customer_name
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id;

Interview Tips

  • Understand the difference between correlated and non-correlated subqueries
  • Know when to use EXISTS vs IN
  • Be aware of NULL handling issues with NOT IN
  • Practice converting subqueries to JOINs and vice versa
  • Understand performance implications of different subquery types

Test Your Knowledge

Take a quick quiz to test your understanding of this topic.