How do you implement data warehousing concepts in SQL?
Answer
Data warehousing in SQL involves dimensional modeling, ETL processes, and analytical query patterns that support business intelligence and reporting. Key concepts include fact tables, dimension tables, star and snowflake schemas, slowly changing dimensions (SCDs), and OLAP operations. The examples below use PostgreSQL syntax, though the concepts carry over to other engines.
Dimensional Modeling
Star Schema Implementation
-- Dimension Tables
CREATE TABLE dim_date (
date_key INT PRIMARY KEY,
date_value DATE NOT NULL,
year INT NOT NULL,
quarter INT NOT NULL,
month INT NOT NULL,
month_name VARCHAR(20) NOT NULL,
day_of_week INT NOT NULL,
day_name VARCHAR(20) NOT NULL,
is_weekend BOOLEAN NOT NULL,
is_holiday BOOLEAN DEFAULT FALSE
);
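A date dimension is usually pre-populated for the full range of dates the warehouse will cover. A minimal PostgreSQL sketch using generate_series follows; the date range and the Saturday/Sunday weekend convention are illustrative:
-- Pre-populate dim_date (range chosen for illustration)
INSERT INTO dim_date (date_key, date_value, year, quarter, month, month_name,
                      day_of_week, day_name, is_weekend)
SELECT
    to_char(d, 'YYYYMMDD')::INT,        -- "smart" surrogate key, e.g. 20240115
    d::DATE,
    EXTRACT(YEAR FROM d)::INT,
    EXTRACT(QUARTER FROM d)::INT,
    EXTRACT(MONTH FROM d)::INT,
    to_char(d, 'FMMonth'),
    EXTRACT(ISODOW FROM d)::INT,        -- 1 = Monday ... 7 = Sunday
    to_char(d, 'FMDay'),
    EXTRACT(ISODOW FROM d) IN (6, 7)    -- Saturday/Sunday
FROM generate_series(DATE '2020-01-01', DATE '2030-12-31', INTERVAL '1 day') AS d;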
CREATE TABLE dim_customer (
    customer_key INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    customer_id VARCHAR(20) NOT NULL, -- Business (natural) key
    customer_name VARCHAR(100) NOT NULL,
    customer_type VARCHAR(20),
    region VARCHAR(50),
    country VARCHAR(50),
    -- SCD Type 2 columns
    effective_date DATE NOT NULL,
    expiry_date DATE DEFAULT '9999-12-31',
    is_current BOOLEAN DEFAULT TRUE,
    -- Metadata
    created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE dim_product (
    product_key INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    product_id VARCHAR(20) NOT NULL,
    product_name VARCHAR(100) NOT NULL,
    category VARCHAR(50),
    subcategory VARCHAR(50),
    brand VARCHAR(50),
    unit_cost DECIMAL(10,2),
    -- SCD columns
    effective_date DATE NOT NULL,
    expiry_date DATE DEFAULT '9999-12-31',
    is_current BOOLEAN DEFAULT TRUE
);
-- Fact Table
CREATE TABLE fact_sales (
    sales_key BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    -- Foreign keys to dimensions
    date_key INT NOT NULL REFERENCES dim_date(date_key),
    customer_key INT NOT NULL REFERENCES dim_customer(customer_key),
    product_key INT NOT NULL REFERENCES dim_product(product_key),
    -- Measures
    quantity INT NOT NULL,
    unit_price DECIMAL(10,2) NOT NULL,
    discount_amount DECIMAL(10,2) DEFAULT 0,
    tax_amount DECIMAL(10,2) DEFAULT 0,
    total_amount DECIMAL(12,2) NOT NULL,
    -- Degenerate dimensions (transaction identifiers with no dimension table)
    order_number VARCHAR(20),
    invoice_number VARCHAR(20),
    -- Metadata
    created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Indexes for performance
CREATE INDEX IX_fact_sales_date ON fact_sales (date_key);
CREATE INDEX IX_fact_sales_customer ON fact_sales (customer_key);
CREATE INDEX IX_fact_sales_product ON fact_sales (product_key);
CREATE INDEX IX_fact_sales_composite ON fact_sales (date_key, customer_key, product_key);
Snowflake Schema Implementation
-- Normalized dimension tables
CREATE TABLE dim_product_category (
    category_key INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    category_id VARCHAR(10) NOT NULL,
    category_name VARCHAR(50) NOT NULL,
    department VARCHAR(50)
);
CREATE TABLE dim_product_brand (
    brand_key INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    brand_id VARCHAR(10) NOT NULL,
    brand_name VARCHAR(50) NOT NULL,
    manufacturer VARCHAR(100)
);
-- Modified product dimension (snowflake)
CREATE TABLE dim_product_snowflake (
    product_key INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    product_id VARCHAR(20) NOT NULL,
    product_name VARCHAR(100) NOT NULL,
    -- Foreign keys to other dimension tables
    category_key INT REFERENCES dim_product_category(category_key),
    brand_key INT REFERENCES dim_product_brand(brand_key),
    unit_cost DECIMAL(10,2),
    effective_date DATE NOT NULL,
    expiry_date DATE DEFAULT '9999-12-31',
    is_current BOOLEAN DEFAULT TRUE
);
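Querying a snowflaked dimension requires extra joins to reassemble the attributes that a star-schema dimension stores inline; a short sketch:
-- Rebuild the denormalized product view from the snowflaked tables
SELECT p.product_id, p.product_name, c.category_name, b.brand_name
FROM dim_product_snowflake p
LEFT JOIN dim_product_category c ON p.category_key = c.category_key
LEFT JOIN dim_product_brand b ON p.brand_key = b.brand_key
WHERE p.is_current = TRUE;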
Slowly Changing Dimensions (SCD)
SCD Type 1 - Overwrite
-- Simple update overwrites old values
CREATE OR REPLACE FUNCTION update_customer_scd1(
p_customer_id VARCHAR(20),
p_customer_name VARCHAR(100),
p_region VARCHAR(50)
) RETURNS VOID AS $$
BEGIN
UPDATE dim_customer
SET customer_name = p_customer_name,
region = p_region,
updated_date = CURRENT_TIMESTAMP
WHERE customer_id = p_customer_id
AND is_current = TRUE;
END;
$$ LANGUAGE plpgsql;
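Calling it is a plain function invocation; the argument values here are made up for illustration:
SELECT update_customer_scd1('CUST-001', 'Acme Corporation', 'West');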
SCD Type 2 - Add New Record
-- Maintain history by expiring the old row and adding a new one
CREATE OR REPLACE FUNCTION update_customer_scd2(
    p_customer_id VARCHAR(20),
    p_customer_name VARCHAR(100),
    p_customer_type VARCHAR(20),
    p_region VARCHAR(50)
) RETURNS VOID AS $$
DECLARE
    current_record RECORD;
BEGIN
    -- Get current record
    SELECT * INTO current_record
    FROM dim_customer
    WHERE customer_id = p_customer_id
    AND is_current = TRUE;

    -- Brand-new customer: just insert the first version
    IF NOT FOUND THEN
        INSERT INTO dim_customer (
            customer_id, customer_name, customer_type, region,
            effective_date, expiry_date, is_current
        ) VALUES (
            p_customer_id, p_customer_name, p_customer_type, p_region,
            CURRENT_DATE, '9999-12-31', TRUE
        );
        RETURN;
    END IF;

    -- Check if data has changed (IS DISTINCT FROM is NULL-safe, unlike <>)
    IF current_record.customer_name IS DISTINCT FROM p_customer_name
    OR current_record.customer_type IS DISTINCT FROM p_customer_type
    OR current_record.region IS DISTINCT FROM p_region THEN
        -- Expire current record
        UPDATE dim_customer
        SET expiry_date = CURRENT_DATE - 1,
            is_current = FALSE,
            updated_date = CURRENT_TIMESTAMP
        WHERE customer_key = current_record.customer_key;

        -- Insert new record
        INSERT INTO dim_customer (
            customer_id, customer_name, customer_type, region,
            effective_date, expiry_date, is_current
        ) VALUES (
            p_customer_id, p_customer_name, p_customer_type, p_region,
            CURRENT_DATE, '9999-12-31', TRUE
        );
    END IF;
END;
$$ LANGUAGE plpgsql;
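The payoff of Type 2 is point-in-time lookup: filtering on the effective/expiry range reconstructs the attributes as of any date. The customer id and date below are illustrative:
-- Customer attributes as they stood on 2024-06-15
SELECT customer_name, customer_type, region
FROM dim_customer
WHERE customer_id = 'CUST-001'
AND DATE '2024-06-15' BETWEEN effective_date AND expiry_date;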
SCD Type 3 - Add New Column
-- Track previous value in separate column
ALTER TABLE dim_customer
ADD COLUMN previous_region VARCHAR(50),
ADD COLUMN region_change_date DATE;
CREATE OR REPLACE FUNCTION update_customer_scd3(
p_customer_id VARCHAR(20),
p_new_region VARCHAR(50)
) RETURNS VOID AS $$
BEGIN
UPDATE dim_customer
SET previous_region = region,
region = p_new_region,
region_change_date = CURRENT_DATE,
updated_date = CURRENT_TIMESTAMP
WHERE customer_id = p_customer_id
AND is_current = TRUE
AND region <> p_new_region;
END;
$$ LANGUAGE plpgsql;
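Usage mirrors the Type 1 call (values illustrative); the function is a no-op when the region is unchanged:
SELECT update_customer_scd3('CUST-001', 'North');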
ETL Implementation
Extract, Transform, Load Process
-- Staging tables for ETL
CREATE TABLE staging_sales (
transaction_id VARCHAR(50),
transaction_date DATE,
customer_id VARCHAR(20),
product_id VARCHAR(20),
quantity INT,
unit_price DECIMAL(10,2),
discount_percent DECIMAL(5,2),
-- ETL metadata
batch_id INT,
load_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed BOOLEAN DEFAULT FALSE
);
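Staging is normally bulk-loaded rather than inserted row by row; in PostgreSQL that is typically COPY. The file path and batch id here are hypothetical:
-- Server-side bulk load; psql's \copy is the client-side equivalent
COPY staging_sales (transaction_id, transaction_date, customer_id,
                    product_id, quantity, unit_price, discount_percent, batch_id)
FROM '/data/sales_batch_42.csv' WITH (FORMAT csv, HEADER true);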
-- ETL procedure: looks up surrogate keys, derives measures, loads the fact table
CREATE OR REPLACE FUNCTION etl_load_sales_fact(p_batch_id INT)
RETURNS TABLE(processed_rows INT, error_rows INT) AS $$
DECLARE
    v_processed_rows INT := 0;
    v_error_rows INT := 0;
    staging_record RECORD;
BEGIN
    -- Process each staging record
    FOR staging_record IN
        SELECT * FROM staging_sales
        WHERE batch_id = p_batch_id AND NOT processed
    LOOP
        BEGIN
            -- Transform and load
            INSERT INTO fact_sales (
                date_key,
                customer_key,
                product_key,
                quantity,
                unit_price,
                discount_amount,
                total_amount,
                order_number
            )
            SELECT
                d.date_key,
                c.customer_key,
                p.product_key,
                s.quantity,
                s.unit_price,
                s.unit_price * s.quantity * s.discount_percent / 100,
                s.unit_price * s.quantity * (1 - s.discount_percent / 100),
                s.transaction_id
            FROM staging_sales s
            JOIN dim_date d ON d.date_value = s.transaction_date
            JOIN dim_customer c ON c.customer_id = s.customer_id AND c.is_current = TRUE
            JOIN dim_product p ON p.product_id = s.product_id AND p.is_current = TRUE
            WHERE s.transaction_id = staging_record.transaction_id;

            -- A failed dimension lookup inserts zero rows; surface it as an
            -- error instead of silently marking the row processed
            IF NOT FOUND THEN
                RAISE EXCEPTION 'No dimension match for transaction %',
                    staging_record.transaction_id;
            END IF;

            -- Mark as processed
            UPDATE staging_sales
            SET processed = TRUE
            WHERE transaction_id = staging_record.transaction_id;

            v_processed_rows := v_processed_rows + 1;
        EXCEPTION WHEN OTHERS THEN
            -- Log error and continue with the next record
            INSERT INTO etl_error_log (
                batch_id, transaction_id, error_message, error_timestamp
            ) VALUES (
                p_batch_id, staging_record.transaction_id, SQLERRM, CURRENT_TIMESTAMP
            );
            v_error_rows := v_error_rows + 1;
        END;
    END LOOP;

    RETURN QUERY SELECT v_processed_rows, v_error_rows;
END;
$$ LANGUAGE plpgsql;
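The function writes failures to etl_error_log, which is referenced above but not defined; a minimal definition consistent with that INSERT, plus a sample invocation (batch id illustrative):
CREATE TABLE etl_error_log (
    error_id SERIAL PRIMARY KEY,
    batch_id INT,
    transaction_id VARCHAR(50),
    error_message TEXT,
    error_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Run a batch and get (processed_rows, error_rows) back
SELECT * FROM etl_load_sales_fact(42);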
Data Quality and Validation
-- Data quality checks
CREATE TABLE data_quality_rules (
rule_id SERIAL PRIMARY KEY,
rule_name VARCHAR(100),
table_name VARCHAR(50),
column_name VARCHAR(50),
rule_type VARCHAR(20), -- NOT_NULL, RANGE, FORMAT, REFERENCE
rule_definition TEXT,
is_active BOOLEAN DEFAULT TRUE
);
-- Data quality validation function
CREATE OR REPLACE FUNCTION validate_data_quality(p_batch_id INT)
RETURNS TABLE(rule_name VARCHAR, failed_count INT) AS $$
BEGIN
-- Check for null values in required fields
RETURN QUERY
SELECT
'Customer ID Not Null'::VARCHAR,
COUNT(*)::INT
FROM staging_sales
WHERE batch_id = p_batch_id
AND customer_id IS NULL;
-- Check for valid date ranges
RETURN QUERY
SELECT
'Valid Transaction Date'::VARCHAR,
COUNT(*)::INT
FROM staging_sales
WHERE batch_id = p_batch_id
AND (transaction_date < '2020-01-01' OR transaction_date > CURRENT_DATE);
-- Check for positive quantities and prices
RETURN QUERY
SELECT
'Positive Quantity and Price'::VARCHAR,
COUNT(*)::INT
FROM staging_sales
WHERE batch_id = p_batch_id
AND (quantity <= 0 OR unit_price <= 0);
-- Check referential integrity
RETURN QUERY
SELECT
'Valid Customer Reference'::VARCHAR,
COUNT(*)::INT
FROM staging_sales s
LEFT JOIN dim_customer c ON s.customer_id = c.customer_id AND c.is_current = TRUE
WHERE s.batch_id = p_batch_id
AND c.customer_key IS NULL;
END;
$$ LANGUAGE plpgsql;
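Running the checks for a batch returns one row per rule with its failure count (batch id illustrative):
SELECT * FROM validate_data_quality(42);
-- Any non-zero failed_count should block or quarantine the batch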
OLAP Operations
Cube and Rollup Operations
-- Sales cube with multiple dimensions
SELECT
COALESCE(dd.year::VARCHAR, 'All Years') AS year,
COALESCE(dc.region, 'All Regions') AS region,
COALESCE(dp.category, 'All Categories') AS category,
SUM(fs.total_amount) AS total_sales,
SUM(fs.quantity) AS total_quantity,
COUNT(*) AS transaction_count
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN dim_product dp ON fs.product_key = dp.product_key
WHERE dd.year >= 2023
GROUP BY ROLLUP (dd.year, dc.region, dp.category)
ORDER BY year, region, category;
-- Cube operation for all combinations
SELECT
COALESCE(dd.quarter::VARCHAR, 'All Quarters') AS quarter,
COALESCE(dc.customer_type, 'All Types') AS customer_type,
COALESCE(dp.brand, 'All Brands') AS brand,
SUM(fs.total_amount) AS total_sales,
AVG(fs.unit_price) AS avg_unit_price
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN dim_product dp ON fs.product_key = dp.product_key
WHERE dd.year = 2024
GROUP BY CUBE (dd.quarter, dc.customer_type, dp.brand)
ORDER BY quarter, customer_type, brand;
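One caveat: COALESCE cannot distinguish a NULL generated by ROLLUP/CUBE from a NULL that exists in the data. GROUPING() disambiguates, as in this variant of the rollup above:
-- GROUPING(col) returns 1 when the column was aggregated away by ROLLUP/CUBE
SELECT
    CASE WHEN GROUPING(dd.year) = 1 THEN 'All Years' ELSE dd.year::VARCHAR END AS year,
    CASE WHEN GROUPING(dc.region) = 1 THEN 'All Regions' ELSE dc.region END AS region,
    SUM(fs.total_amount) AS total_sales
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
GROUP BY ROLLUP (dd.year, dc.region);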
Window Functions for Analytics
-- Running totals and moving averages
SELECT
dd.date_value,
dc.region,
SUM(fs.total_amount) AS daily_sales,
SUM(SUM(fs.total_amount)) OVER (
PARTITION BY dc.region
ORDER BY dd.date_value
ROWS UNBOUNDED PRECEDING
) AS running_total,
AVG(SUM(fs.total_amount)) OVER (
PARTITION BY dc.region
ORDER BY dd.date_value
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7_days,
RANK() OVER (
PARTITION BY dd.year, dd.month
ORDER BY SUM(fs.total_amount) DESC
) AS monthly_rank
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
GROUP BY dd.date_value, dd.year, dd.month, dc.region
ORDER BY dc.region, dd.date_value;
Advanced Analytics
Time Intelligence Functions
-- Year-over-year comparison
WITH monthly_sales AS (
SELECT
dd.year,
dd.month,
dc.region,
SUM(fs.total_amount) AS monthly_total
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
GROUP BY dd.year, dd.month, dc.region
)
SELECT
current_year.year,
current_year.month,
current_year.region,
current_year.monthly_total,
previous_year.monthly_total AS previous_year_total,
current_year.monthly_total - previous_year.monthly_total AS yoy_difference,
ROUND(
    (current_year.monthly_total - previous_year.monthly_total) * 100.0 /
    NULLIF(previous_year.monthly_total, 0), 2
) AS yoy_growth_percent
FROM monthly_sales current_year
LEFT JOIN monthly_sales previous_year
ON current_year.month = previous_year.month
AND current_year.region = previous_year.region
AND current_year.year = previous_year.year + 1
WHERE current_year.year = 2024
ORDER BY current_year.region, current_year.month;
Cohort Analysis
-- Customer cohort analysis
WITH customer_cohorts AS (
SELECT
dc.customer_id,
MIN(dd.date_value) AS first_purchase_date,
DATE_TRUNC('month', MIN(dd.date_value)) AS cohort_month
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
GROUP BY dc.customer_id
),
cohort_data AS (
SELECT
cc.cohort_month,
DATE_TRUNC('month', dd.date_value) AS purchase_month,
EXTRACT(YEAR FROM AGE(dd.date_value, cc.first_purchase_date)) * 12 +
EXTRACT(MONTH FROM AGE(dd.date_value, cc.first_purchase_date)) AS period_number,
COUNT(DISTINCT dc.customer_id) AS customers
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN customer_cohorts cc ON dc.customer_id = cc.customer_id
GROUP BY cc.cohort_month, purchase_month, period_number
)
SELECT
cohort_month,
period_number,
customers,
FIRST_VALUE(customers) OVER (
PARTITION BY cohort_month
ORDER BY period_number
) AS cohort_size,
ROUND(
customers * 100.0 / FIRST_VALUE(customers) OVER (
PARTITION BY cohort_month
ORDER BY period_number
), 2
) AS retention_rate
FROM cohort_data
ORDER BY cohort_month, period_number;
Performance Optimization
Aggregate Tables and Materialized Views
-- Pre-aggregated summary tables
CREATE TABLE agg_sales_monthly (
year INT,
month INT,
region VARCHAR(50),
category VARCHAR(50),
total_sales DECIMAL(15,2),
total_quantity INT,
unique_customers INT,
avg_order_value DECIMAL(10,2),
PRIMARY KEY (year, month, region, category)
);
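The summary table must be maintained by the ETL process; one common sketch is a delete-and-reinsert rebuild of the affected period after each fact load (the year/month values are illustrative, and region/category are assumed non-null since they form part of the primary key):
-- Rebuild one month of the aggregate
DELETE FROM agg_sales_monthly WHERE year = 2024 AND month = 1;

INSERT INTO agg_sales_monthly (year, month, region, category, total_sales,
                               total_quantity, unique_customers, avg_order_value)
SELECT dd.year, dd.month, dc.region, dp.category,
       SUM(fs.total_amount), SUM(fs.quantity),
       COUNT(DISTINCT fs.customer_key), AVG(fs.total_amount)
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN dim_product dp ON fs.product_key = dp.product_key
WHERE dd.year = 2024 AND dd.month = 1
GROUP BY dd.year, dd.month, dc.region, dp.category;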
-- Materialized view for daily aggregates
CREATE MATERIALIZED VIEW mv_sales_daily AS
SELECT
dd.date_value,
dd.year,
dd.month,
dd.day_of_week,
dc.region,
dc.customer_type,
dp.category,
COUNT(*) AS transaction_count,
SUM(fs.total_amount) AS total_sales,
SUM(fs.quantity) AS total_quantity,
COUNT(DISTINCT fs.customer_key) AS unique_customers,
AVG(fs.total_amount) AS avg_transaction_value
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN dim_product dp ON fs.product_key = dp.product_key
GROUP BY
dd.date_value, dd.year, dd.month, dd.day_of_week,
dc.region, dc.customer_type, dp.category;
-- Indexes on materialized view
CREATE INDEX idx_mv_sales_daily_date ON mv_sales_daily (date_value);
CREATE INDEX idx_mv_sales_daily_region ON mv_sales_daily (region);
CREATE INDEX idx_mv_sales_daily_category ON mv_sales_daily (category);
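A materialized view is a snapshot, so the ETL schedule must refresh it. REFRESH ... CONCURRENTLY keeps the view readable during the refresh, but it requires a unique index covering the view's grain (and assumes those grouped columns are non-null):
-- Unique index on the full grain of the view, required for CONCURRENTLY
CREATE UNIQUE INDEX idx_mv_sales_daily_grain
ON mv_sales_daily (date_value, region, customer_type, category);

REFRESH MATERIALIZED VIEW CONCURRENTLY mv_sales_daily;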
Partitioning Strategy
-- Partition fact table by date
-- Note: LIKE ... INCLUDING ALL would also copy the primary key on sales_key,
-- which PostgreSQL rejects: a unique constraint on a partitioned table must
-- include the partition key (date_key). Copy defaults only, and give the
-- surrogate key its own identity or sequence if needed.
CREATE TABLE fact_sales_partitioned (
    LIKE fact_sales INCLUDING DEFAULTS
) PARTITION BY RANGE (date_key);
-- Create monthly partitions
CREATE TABLE fact_sales_2024_01 PARTITION OF fact_sales_partitioned
FOR VALUES FROM (20240101) TO (20240201);
CREATE TABLE fact_sales_2024_02 PARTITION OF fact_sales_partitioned
FOR VALUES FROM (20240201) TO (20240301);
-- Automated partition creation
CREATE OR REPLACE FUNCTION create_monthly_partition(
table_name TEXT,
year_month DATE
) RETURNS VOID AS $$
DECLARE
partition_name TEXT;
start_date INT;
end_date INT;
BEGIN
    -- Normalize to the first of the month so partition bounds always align
    year_month := date_trunc('month', year_month)::DATE;
    partition_name := table_name || '_' || to_char(year_month, 'YYYY_MM');
    start_date := to_char(year_month, 'YYYYMMDD')::INT;
    end_date := to_char(year_month + INTERVAL '1 month', 'YYYYMMDD')::INT;
EXECUTE format('CREATE TABLE %I PARTITION OF %I FOR VALUES FROM (%s) TO (%s)',
partition_name, table_name, start_date, end_date);
END;
$$ LANGUAGE plpgsql;
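For example, creating the March 2024 partition ahead of the month's loads:
SELECT create_monthly_partition('fact_sales_partitioned', DATE '2024-03-01');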
Data Warehouse Monitoring
Performance Metrics
-- Query performance monitoring
CREATE TABLE dw_query_stats (
query_id SERIAL PRIMARY KEY,
query_text TEXT,
execution_time_ms INT,
rows_examined BIGINT,
rows_returned BIGINT,
tables_accessed TEXT[],
execution_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ETL monitoring
CREATE TABLE etl_batch_log (
batch_id SERIAL PRIMARY KEY,
batch_name VARCHAR(100),
start_time TIMESTAMP,
end_time TIMESTAMP,
records_processed INT,
records_failed INT,
status VARCHAR(20), -- RUNNING, SUCCESS, FAILED
error_message TEXT
);
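Given the batch log, an operations dashboard can compute duration and throughput directly; a sample monitoring query:
-- Recent batches with duration and rows/second throughput
SELECT batch_name, status,
       end_time - start_time AS duration,
       records_processed, records_failed,
       ROUND((records_processed /
              NULLIF(EXTRACT(EPOCH FROM (end_time - start_time)), 0))::NUMERIC, 1)
           AS rows_per_second
FROM etl_batch_log
ORDER BY batch_id DESC
LIMIT 20;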
-- Data freshness monitoring
CREATE VIEW data_freshness AS
SELECT
'fact_sales' AS table_name,
MAX(created_date) AS last_update,
COUNT(*) AS record_count,
EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - MAX(created_date))) / 3600 AS hours_since_update
FROM fact_sales
UNION ALL
SELECT
'dim_customer' AS table_name,
MAX(updated_date) AS last_update,
COUNT(*) AS record_count,
EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - MAX(updated_date))) / 3600 AS hours_since_update
FROM dim_customer;
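The view makes stale-data alerting a one-liner; the 24-hour threshold is an arbitrary example:
SELECT table_name, hours_since_update
FROM data_freshness
WHERE hours_since_update > 24;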
Best Practices
Design Guidelines
- Use surrogate keys for dimension tables
- Implement proper SCD strategies based on business requirements
- Design fact tables with appropriate grain (level of detail)
- Use conformed dimensions across multiple fact tables
- Implement comprehensive data quality checks
- Plan for scalability with partitioning and indexing strategies
ETL Best Practices
- Implement incremental loading where possible (see the watermark sketch after this list)
- Use staging areas for data validation
- Maintain comprehensive error logging
- Design for restartability and recovery
- Monitor data lineage and dependencies
- Implement proper change data capture (CDC)
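As referenced above, incremental loading typically relies on a high-watermark. A minimal sketch follows; source_sales and etl_watermark are hypothetical tables standing in for the operational source and the ETL control metadata, and the batch id is illustrative:
-- Extract only rows newer than the last successful load
INSERT INTO staging_sales (transaction_id, transaction_date, customer_id,
                           product_id, quantity, unit_price, discount_percent, batch_id)
SELECT s.transaction_id, s.transaction_date, s.customer_id,
       s.product_id, s.quantity, s.unit_price, s.discount_percent, 43
FROM source_sales s
WHERE s.transaction_date > (
    SELECT last_loaded_date FROM etl_watermark WHERE table_name = 'fact_sales'
);

-- Advance the watermark only after the batch succeeds
UPDATE etl_watermark
SET last_loaded_date = (SELECT MAX(transaction_date)
                        FROM staging_sales WHERE batch_id = 43)
WHERE table_name = 'fact_sales';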
Interview Tips
- Understand dimensional modeling concepts (facts, dimensions, star/snowflake schemas)
- Know different SCD types and when to use each
- Be familiar with ETL processes and data quality considerations
- Understand OLAP operations (rollup, cube, drill-down)
- Know performance optimization techniques for data warehouses
- Practice writing analytical queries with window functions
- Be aware of modern alternatives (data lakes, cloud data warehouses)
- Understand the differences between OLTP and OLAP systems