What are materialized views and when should you use them?

Answer

Materialized views are database objects that store the result of a query physically on disk, unlike regular views which are virtual. They provide pre-computed results for complex queries, significantly improving performance at the cost of storage space and data freshness.

Materialized Views vs Regular Views

Regular Views

-- Regular view (virtual)
CREATE VIEW customer_summary AS
SELECT 
    c.customer_id,
    c.customer_name,
    COUNT(o.order_id) as total_orders,
    SUM(o.order_total) as total_spent,
    AVG(o.order_total) as avg_order_value
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name;

-- Query executes underlying SQL every time
SELECT * FROM customer_summary WHERE total_spent > 1000;

Materialized Views

-- PostgreSQL materialized view
CREATE MATERIALIZED VIEW customer_summary_mv AS
SELECT 
    c.customer_id,
    c.customer_name,
    COUNT(o.order_id) as total_orders,
    SUM(o.order_total) as total_spent,
    AVG(o.order_total) as avg_order_value,
    MAX(o.order_date) as last_order_date
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name;

-- Create index for better performance
CREATE INDEX idx_customer_summary_mv_spent 
ON customer_summary_mv (total_spent);

-- Query reads pre-computed results (fast)
SELECT * FROM customer_summary_mv WHERE total_spent > 1000;

Creating Materialized Views

PostgreSQL Implementation

-- Basic materialized view
CREATE MATERIALIZED VIEW sales_monthly_summary AS
SELECT 
    DATE_TRUNC('month', order_date) as month,
    COUNT(*) as order_count,
    SUM(order_total) as total_revenue,
    AVG(order_total) as avg_order_value,
    COUNT(DISTINCT customer_id) as unique_customers
FROM orders
WHERE order_date >= '2023-01-01'
GROUP BY DATE_TRUNC('month', order_date);

-- With additional options
CREATE MATERIALIZED VIEW product_performance_mv
TABLESPACE fast_ssd  -- Specify tablespace
AS
SELECT 
    p.product_id,
    p.product_name,
    p.category,
    COUNT(oi.order_id) as times_ordered,
    SUM(oi.quantity) as total_quantity_sold,
    SUM(oi.quantity * oi.unit_price) as total_revenue,
    AVG(oi.unit_price) as avg_selling_price
FROM products p
LEFT JOIN order_items oi ON p.product_id = oi.product_id
LEFT JOIN orders o ON oi.order_id = o.order_id
WHERE o.order_date >= '2023-01-01'
GROUP BY p.product_id, p.product_name, p.category
WITH NO DATA;  -- Create structure without data

-- Populate the materialized view
REFRESH MATERIALIZED VIEW product_performance_mv;

Oracle Implementation

-- Oracle materialized view with automatic refresh
CREATE MATERIALIZED VIEW customer_sales_mv
BUILD IMMEDIATE
REFRESH COMPLETE ON DEMAND
ENABLE QUERY REWRITE
AS
SELECT 
    c.customer_id,
    c.customer_name,
    c.region,
    SUM(o.order_total) as total_sales,
    COUNT(o.order_id) as order_count,
    MAX(o.order_date) as last_order_date
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name, c.region;

-- Fast refresh materialized view (requires materialized view log)
CREATE MATERIALIZED VIEW LOG ON orders
WITH ROWID, SEQUENCE (customer_id, order_total, order_date)
INCLUDING NEW VALUES;

CREATE MATERIALIZED VIEW fast_refresh_mv
REFRESH FAST ON COMMIT
AS
SELECT 
    customer_id,
    SUM(order_total) as total_sales,
    COUNT(*) as order_count
FROM orders
GROUP BY customer_id;

SQL Server Indexed Views

-- SQL Server indexed view (similar to materialized view)
CREATE VIEW customer_totals_iv
WITH SCHEMABINDING
AS
SELECT 
    c.customer_id,
    COUNT_BIG(*) as order_count,
    SUM(o.order_total) as total_sales
FROM dbo.customers c
JOIN dbo.orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id;

-- Create unique clustered index (makes it "materialized")
CREATE UNIQUE CLUSTERED INDEX IX_customer_totals_iv
ON customer_totals_iv (customer_id);

-- Additional non-clustered indexes
CREATE NONCLUSTERED INDEX IX_customer_totals_sales
ON customer_totals_iv (total_sales DESC);

Refresh Strategies

Manual Refresh

-- PostgreSQL: Complete refresh
REFRESH MATERIALIZED VIEW customer_summary_mv;

-- Concurrent refresh (non-blocking)
REFRESH MATERIALIZED VIEW CONCURRENTLY customer_summary_mv;

-- Oracle: Manual refresh
EXEC DBMS_MVIEW.REFRESH('customer_sales_mv', 'C');  -- Complete
EXEC DBMS_MVIEW.REFRESH('customer_sales_mv', 'F');  -- Fast

Scheduled Refresh

-- PostgreSQL: Using pg_cron extension
SELECT cron.schedule('refresh-customer-summary', '0 2 * * *', 
    'REFRESH MATERIALIZED VIEW customer_summary_mv;');

-- Oracle: Using DBMS_SCHEDULER
BEGIN
    DBMS_SCHEDULER.CREATE_JOB(
        job_name => 'REFRESH_CUSTOMER_MV',
        job_type => 'PLSQL_BLOCK',
        job_action => 'BEGIN DBMS_MVIEW.REFRESH(''customer_sales_mv''); END;',
        start_date => SYSTIMESTAMP,
        repeat_interval => 'FREQ=DAILY;BYHOUR=2',
        enabled => TRUE
    );
END;

Incremental Refresh

-- Oracle fast refresh setup
CREATE MATERIALIZED VIEW LOG ON orders
WITH ROWID, SEQUENCE (customer_id, order_total, order_date)
INCLUDING NEW VALUES;

CREATE MATERIALIZED VIEW LOG ON customers
WITH ROWID, SEQUENCE (customer_id, customer_name)
INCLUDING NEW VALUES;

-- Materialized view with fast refresh capability
CREATE MATERIALIZED VIEW customer_sales_fast_mv
REFRESH FAST ON DEMAND
AS
SELECT 
    c.customer_id,
    c.customer_name,
    SUM(o.order_total) as total_sales,
    COUNT(o.order_id) as order_count
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name;

Advanced Use Cases

Data Warehouse Aggregations

-- Multi-dimensional sales cube
CREATE MATERIALIZED VIEW sales_cube_mv AS
SELECT 
    DATE_TRUNC('month', o.order_date) as month,
    c.region,
    p.category,
    COUNT(*) as order_count,
    SUM(oi.quantity) as total_quantity,
    SUM(oi.quantity * oi.unit_price) as total_revenue,
    AVG(oi.unit_price) as avg_price
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.order_date >= '2023-01-01'
GROUP BY 
    DATE_TRUNC('month', o.order_date),
    c.region,
    p.category;

-- Create indexes for different query patterns
CREATE INDEX idx_sales_cube_month ON sales_cube_mv (month);
CREATE INDEX idx_sales_cube_region ON sales_cube_mv (region);
CREATE INDEX idx_sales_cube_category ON sales_cube_mv (category);
CREATE INDEX idx_sales_cube_revenue ON sales_cube_mv (total_revenue DESC);

Real-time Analytics Dashboard

-- Dashboard metrics materialized view
CREATE MATERIALIZED VIEW dashboard_metrics_mv AS
WITH daily_stats AS (
    SELECT 
        DATE(order_date) as date,
        COUNT(*) as daily_orders,
        SUM(order_total) as daily_revenue,
        COUNT(DISTINCT customer_id) as daily_customers
    FROM orders
    WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY DATE(order_date)
),
product_stats AS (
    SELECT 
        p.product_name,
        SUM(oi.quantity) as quantity_sold,
        ROW_NUMBER() OVER (ORDER BY SUM(oi.quantity) DESC) as rank
    FROM products p
    JOIN order_items oi ON p.product_id = oi.product_id
    JOIN orders o ON oi.order_id = o.order_id
    WHERE o.order_date >= CURRENT_DATE - INTERVAL '7 days'
    GROUP BY p.product_id, p.product_name
)
SELECT 
    'summary' as metric_type,
    json_build_object(
        'total_orders', (SELECT SUM(daily_orders) FROM daily_stats),
        'total_revenue', (SELECT SUM(daily_revenue) FROM daily_stats),
        'avg_daily_orders', (SELECT AVG(daily_orders) FROM daily_stats),
        'top_products', (SELECT json_agg(product_name ORDER BY rank) 
                        FROM product_stats WHERE rank <= 5)
    ) as metrics
UNION ALL
SELECT 
    'daily_trend' as metric_type,
    json_agg(json_build_object(
        'date', date,
        'orders', daily_orders,
        'revenue', daily_revenue,
        'customers', daily_customers
    ) ORDER BY date) as metrics
FROM daily_stats;

Complex Reporting Views

-- Customer lifecycle analysis
CREATE MATERIALIZED VIEW customer_lifecycle_mv AS
WITH customer_metrics AS (
    SELECT 
        c.customer_id,
        c.customer_name,
        c.registration_date,
        MIN(o.order_date) as first_order_date,
        MAX(o.order_date) as last_order_date,
        COUNT(o.order_id) as total_orders,
        SUM(o.order_total) as total_spent,
        AVG(o.order_total) as avg_order_value
    FROM customers c
    LEFT JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.customer_name, c.registration_date
)
SELECT 
    customer_id,
    customer_name,
    registration_date,
    first_order_date,
    last_order_date,
    total_orders,
    total_spent,
    avg_order_value,
    CASE 
        WHEN total_orders = 0 THEN 'Never Purchased'
        WHEN last_order_date >= CURRENT_DATE - INTERVAL '30 days' THEN 'Active'
        WHEN last_order_date >= CURRENT_DATE - INTERVAL '90 days' THEN 'At Risk'
        WHEN last_order_date >= CURRENT_DATE - INTERVAL '180 days' THEN 'Dormant'
        ELSE 'Lost'
    END as lifecycle_stage,
    CASE 
        WHEN total_spent > 5000 THEN 'VIP'
        WHEN total_spent > 2000 THEN 'Premium'
        WHEN total_spent > 500 THEN 'Regular'
        WHEN total_orders > 0 THEN 'Occasional'
        ELSE 'Prospect'
    END as customer_tier,
    EXTRACT(DAYS FROM (last_order_date - first_order_date)) as customer_lifespan_days
FROM customer_metrics;

Performance Optimization

Indexing Strategies

-- Create appropriate indexes on materialized views
CREATE INDEX idx_customer_lifecycle_stage 
ON customer_lifecycle_mv (lifecycle_stage);

CREATE INDEX idx_customer_lifecycle_tier 
ON customer_lifecycle_mv (customer_tier);

CREATE INDEX idx_customer_lifecycle_spent 
ON customer_lifecycle_mv (total_spent DESC);

-- Composite indexes for common query patterns
CREATE INDEX idx_customer_lifecycle_stage_tier 
ON customer_lifecycle_mv (lifecycle_stage, customer_tier);

-- Partial indexes for specific conditions
CREATE INDEX idx_customer_lifecycle_active 
ON customer_lifecycle_mv (total_spent) 
WHERE lifecycle_stage = 'Active';

Query Rewriting

-- Original expensive query
SELECT 
    lifecycle_stage,
    COUNT(*) as customer_count,
    AVG(total_spent) as avg_spent,
    SUM(total_spent) as total_revenue
FROM (
    -- Complex subquery with multiple joins and calculations
    SELECT c.customer_id, /* ... complex logic ... */
    FROM customers c
    LEFT JOIN orders o ON c.customer_id = o.customer_id
    -- ... more joins and calculations
) customer_analysis
GROUP BY lifecycle_stage;

-- Optimized query using materialized view
SELECT 
    lifecycle_stage,
    COUNT(*) as customer_count,
    AVG(total_spent) as avg_spent,
    SUM(total_spent) as total_revenue
FROM customer_lifecycle_mv
GROUP BY lifecycle_stage;

Partitioned Materialized Views

-- Partitioned materialized view for large datasets
CREATE MATERIALIZED VIEW sales_summary_partitioned
PARTITION BY RANGE (month)
AS
SELECT 
    DATE_TRUNC('month', order_date) as month,
    region,
    SUM(order_total) as total_sales,
    COUNT(*) as order_count
FROM orders
GROUP BY DATE_TRUNC('month', order_date), region;

-- Create partitions
CREATE TABLE sales_summary_2024_q1 PARTITION OF sales_summary_partitioned
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');

CREATE TABLE sales_summary_2024_q2 PARTITION OF sales_summary_partitioned
FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');

Monitoring and Maintenance

Materialized View Statistics

-- PostgreSQL: Monitor materialized view usage
SELECT 
    schemaname,
    matviewname,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||matviewname)) as size,
    ispopulated
FROM pg_matviews;

-- Check last refresh time (requires custom tracking)
CREATE TABLE mv_refresh_log (
    mv_name TEXT,
    refresh_time TIMESTAMP,
    refresh_duration INTERVAL
);

-- Oracle: Materialized view information
SELECT 
    mview_name,
    last_refresh_date,
    refresh_method,
    build_mode,
    fast_refreshable
FROM user_mviews;

Automated Maintenance

-- PostgreSQL: Automated refresh with logging
CREATE OR REPLACE FUNCTION refresh_materialized_view(mv_name TEXT)
RETURNS VOID AS $$
DECLARE
    start_time TIMESTAMP;
    end_time TIMESTAMP;
BEGIN
    start_time := clock_timestamp();
    
    EXECUTE 'REFRESH MATERIALIZED VIEW CONCURRENTLY ' || mv_name;
    
    end_time := clock_timestamp();
    
    INSERT INTO mv_refresh_log (mv_name, refresh_time, refresh_duration)
    VALUES (mv_name, start_time, end_time - start_time);
    
    RAISE NOTICE 'Refreshed % in %', mv_name, end_time - start_time;
END;
$$ LANGUAGE plpgsql;

-- Schedule regular refreshes
SELECT cron.schedule('refresh-sales-summary', '0 */4 * * *', 
    'SELECT refresh_materialized_view(''sales_monthly_summary'');');

When to Use Materialized Views

Good Use Cases

-- 1. Complex aggregations over large datasets
CREATE MATERIALIZED VIEW sales_analytics_mv AS
SELECT 
    region,
    product_category,
    DATE_TRUNC('quarter', order_date) as quarter,
    SUM(order_total) as total_sales,
    COUNT(DISTINCT customer_id) as unique_customers,
    AVG(order_total) as avg_order_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY region, product_category, DATE_TRUNC('quarter', order_date);

-- 2. Frequently accessed reports
CREATE MATERIALIZED VIEW monthly_kpi_report_mv AS
SELECT 
    DATE_TRUNC('month', order_date) as month,
    COUNT(*) as total_orders,
    SUM(order_total) as revenue,
    COUNT(DISTINCT customer_id) as active_customers,
    AVG(order_total) as avg_order_value,
    SUM(order_total) / COUNT(DISTINCT customer_id) as revenue_per_customer
FROM orders
GROUP BY DATE_TRUNC('month', order_date);

-- 3. Data warehouse star schema fact tables
CREATE MATERIALIZED VIEW fact_sales_mv AS
SELECT 
    d.date_key,
    c.customer_key,
    p.product_key,
    l.location_key,
    SUM(oi.quantity) as quantity_sold,
    SUM(oi.quantity * oi.unit_price) as sales_amount,
    COUNT(DISTINCT o.order_id) as transaction_count
FROM orders o
JOIN dim_date d ON DATE(o.order_date) = d.date_value
JOIN dim_customer c ON o.customer_id = c.customer_id
JOIN dim_location l ON o.location_id = l.location_id
JOIN order_items oi ON o.order_id = oi.order_id
JOIN dim_product p ON oi.product_id = p.product_id
GROUP BY d.date_key, c.customer_key, p.product_key, l.location_key;

When NOT to Use

-- Avoid for:
-- 1. Frequently changing data (high refresh overhead)
-- 2. Simple queries that are already fast
-- 3. Small datasets where views are sufficient
-- 4. Real-time requirements (data freshness issues)
-- 5. Limited storage environments

Best Practices

1. Design Considerations

  • Choose appropriate refresh strategy based on data freshness requirements
  • Consider storage costs vs query performance benefits
  • Design for common query patterns
  • Include necessary columns for filtering and sorting

2. Refresh Strategy

  • Use incremental refresh when possible (Oracle fast refresh)
  • Schedule refreshes during low-activity periods
  • Monitor refresh performance and duration
  • Consider concurrent refresh for minimal downtime

3. Indexing

  • Create indexes based on query patterns
  • Use partial indexes for filtered queries
  • Consider composite indexes for multi-column filters
  • Monitor index usage and effectiveness

Interview Tips

  • Understand the trade-offs: performance vs storage vs data freshness
  • Know different refresh strategies and when to use each
  • Be familiar with database-specific implementations
  • Understand when materialized views are beneficial vs regular views
  • Know how to monitor and maintain materialized views
  • Practice designing materialized views for common business scenarios
  • Be aware of limitations and potential issues (stale data, storage costs)

Test Your Knowledge

Take a quick quiz to test your understanding of this topic.