Query Optimization

MongoDB Query Optimization

Use Indexes

// Without index - Collection scan
// MongoDB falls back to COLLSCAN: every document is read and tested
// against the filter.
db.users.find({ email: "john@example.com" });
// Examines: 1,000,000 documents

// With index - Index scan
// 1 = ascending; the B-tree lets MongoDB seek straight to the key.
db.users.createIndex({ email: 1 });
db.users.find({ email: "john@example.com" });
// Examines: 1 document

// Compound index for multiple fields
// Field order matters: equality field (status) first, then the sort
// field (createdAt) so one index satisfies both the filter and the sort.
db.users.createIndex({ status: 1, createdAt: -1 });
db.users.find({ status: "active" }).sort({ createdAt: -1 });

Analyze with explain()

// Execution stats: run the query through the planner and report runtime
// metrics instead of returning documents.
const explain = db.users.find({ email: "john@example.com" })
  .explain("executionStats");

// For an index-backed find the root execution stage is typically FETCH
// with an IXSCAN child; indexName lives on the IXSCAN stage, not on the
// root, so fall through to inputStage when the root has no indexName.
const rootStage = explain.executionStats.executionStages;

console.log({
  executionTimeMillis: explain.executionStats.executionTimeMillis,
  totalDocsExamined: explain.executionStats.totalDocsExamined,
  totalKeysExamined: explain.executionStats.totalKeysExamined,
  indexUsed: rootStage.indexName ?? rootStage.inputStage?.indexName
});

// Good performance indicators:
// - totalDocsExamined close to nReturned
// - Uses index (IXSCAN vs COLLSCAN)
// - Low executionTimeMillis

Covered Queries

// Covered query: the index alone holds every field the query returns,
// so MongoDB never has to fetch the underlying documents.
db.users.createIndex({ email: 1, name: 1 });

db.users.find(
  { email: "john@example.com" },
  { _id: 0, email: 1, name: 1 }  // Only indexed fields
);

// Verify the query really is covered
const coveredExplain = db.users
  .find(
    { email: "john@example.com" },
    { _id: 0, email: 1, name: 1 }
  )
  .explain("executionStats");

console.log(coveredExplain.executionStats.totalDocsExamined);  // Should be 0

Projection

// ❌ Bad - Returns all fields
db.users.find({ status: "active" });

// ✅ Good - Returns only needed fields
// Smaller documents over the wire: less network and BSON-decode cost.
db.users.find(
  { status: "active" },
  { name: 1, email: 1, _id: 0 }
);

// Exclude large fields
// Inclusion (1) and exclusion (0) cannot be mixed in one projection,
// except for _id.
db.users.find(
  { status: "active" },
  { largeField: 0 }
);

Limit Results

// ❌ Bad - Returns all matching documents
db.users.find({ status: "active" });

// ✅ Good - Limit results
db.users.find({ status: "active" }).limit(10);

// Pagination
// NOTE(review): skip() still walks past all skipped documents, so cost
// grows with the page number; for deep pagination prefer a range query
// on an indexed field (e.g. { _id: { $gt: lastSeenId } }).
const page = 2;
const limit = 10;
db.users.find({ status: "active" })
  .skip((page - 1) * limit)
  .limit(limit);

Aggregation Optimization

// ❌ Bad - Filter after grouping
// $group must process every order before $match can discard anything.
db.orders.aggregate([
  { $group: { _id: "$userId", total: { $sum: "$amount" } } },
  { $match: { total: { $gte: 1000 } } }
]);

// ✅ Good - Filter early
// A leading $match can use indexes and shrinks the working set for
// every later stage.
db.orders.aggregate([
  { $match: { status: "completed" } },  // Filter first
  { $project: { userId: 1, amount: 1 } },  // Limit fields
  { $group: { _id: "$userId", total: { $sum: "$amount" } } },
  { $match: { total: { $gte: 1000 } } }  // Filter on the computed total
]);

Cassandra Query Optimization

Partition Key Queries

// ✅ Good - Query by partition key
// The partition key routes the request directly to the replicas that
// own the row; { prepare: true } reuses the server-side parsed statement.
await client.execute(
  'SELECT * FROM users WHERE user_id = ?',
  [userId],
  { prepare: true }
);

// ❌ Bad - Query without partition key (full scan)
// Without the partition key the coordinator must ask every node.
await client.execute(
  'SELECT * FROM users WHERE email = ?',
  [email],
  { prepare: true }
);

Clustering Key Ordering

-- Table with clustering key
-- Rows within a user_id partition are stored sorted by timestamp,
-- newest first, so "latest N events" reads are sequential on disk.
CREATE TABLE user_events (
  user_id UUID,
  timestamp TIMESTAMP,
  event_type TEXT,
  PRIMARY KEY (user_id, timestamp)
) WITH CLUSTERING ORDER BY (timestamp DESC);

-- Efficient query (uses clustering order)
-- Single-partition read plus a range over the clustering column.
SELECT * FROM user_events 
WHERE user_id = ? 
AND timestamp > ?
LIMIT 10;

Avoid ALLOW FILTERING

-- ❌ Bad - Requires ALLOW FILTERING (slow)
-- Forces Cassandra to read and filter rows across the whole cluster.
SELECT * FROM users WHERE email = 'john@example.com' ALLOW FILTERING;

-- ✅ Good - Use materialized view
-- Every column of the view's primary key must be restricted with
-- IS NOT NULL — including the base table's key (user_id) — or the
-- CREATE statement is rejected.
CREATE MATERIALIZED VIEW users_by_email AS
  SELECT * FROM users
  WHERE email IS NOT NULL AND user_id IS NOT NULL
  PRIMARY KEY (email, user_id);

SELECT * FROM users_by_email WHERE email = 'john@example.com';

Redis Query Optimization

Use Appropriate Data Structures

// ❌ Bad - Store list as JSON string
// Every read or update deserializes and rewrites the whole array.
await client.set('user:1:orders', JSON.stringify([1, 2, 3]));
const orders = JSON.parse(await client.get('user:1:orders'));

// ✅ Good - Use Redis list
// Elements can be pushed, popped, or ranged individually.
await client.rPush('user:1:orders', ['1', '2', '3']);
const orderIds = await client.lRange('user:1:orders', 0, -1);

// ✅ Better - Use sorted set for ordered data
// The score (a timestamp here) keeps members sorted and range-queryable.
await client.zAdd('user:1:orders', [
  { score: Date.now(), value: '1' },
  { score: Date.now() + 1000, value: '2' }
]);

Pipeline Commands

// ❌ Bad - Multiple round trips
// Each await is a full network round trip to Redis (100 RTTs total).
for (let i = 0; i < 100; i++) {
  await client.get(`key:${i}`);
}

// ✅ Good - Single round trip
// multi() queues commands client-side and ships them together on exec().
const pipeline = client.multi();
for (let i = 0; i < 100; i++) {
  pipeline.get(`key:${i}`);
}
const results = await pipeline.exec();

Use Lua Scripts

// Atomic operations with single round trip
// The script runs atomically on the server; KEYS is populated from the
// `keys` option passed to eval() below. (For a plain multi-GET like this,
// MGET would be simpler — Lua earns its keep when reads and writes must
// happen atomically together.)
const script = `
  local values = {}
  for i, key in ipairs(KEYS) do
    values[i] = redis.call('GET', key)
  end
  return values
`;

const keys = Array.from({ length: 100 }, (_, i) => `key:${i}`);
const values = await client.eval(script, { keys });

DynamoDB Query Optimization

Use Query Instead of Scan

// ❌ Bad - Scan entire table
// FilterExpression is applied AFTER every item is read (and billed).
const command = new ScanCommand({
  TableName: 'Users',
  FilterExpression: 'email = :email',
  ExpressionAttributeValues: {
    ':email': 'john@example.com'
  }
});

// ✅ Good - Query with GSI
// A global secondary index keyed on email turns this into a direct lookup.
const command = new QueryCommand({
  TableName: 'Users',
  IndexName: 'EmailIndex',
  KeyConditionExpression: 'email = :email',
  ExpressionAttributeValues: {
    ':email': 'john@example.com'
  }
});

Projection Expressions

// ❌ Bad - Return all attributes
const command = new GetCommand({
  TableName: 'Users',
  Key: { userId: '123' }
});

// ✅ Good - Return only needed attributes
// 'name' is a DynamoDB reserved word, so it must be aliased through
// ExpressionAttributeNames (#n).
const command = new GetCommand({
  TableName: 'Users',
  Key: { userId: '123' },
  ProjectionExpression: 'userId, #n, email',
  ExpressionAttributeNames: {
    '#n': 'name'
  }
});

Batch Operations

// ❌ Bad - Multiple single gets
// One network round trip per user.
for (const id of userIds) {
  await docClient.send(new GetCommand({
    TableName: 'Users',
    Key: { userId: id }
  }));
}

// ✅ Good - Batch get
// BatchGetCommand fetches up to 100 items per request; callers should
// check response.UnprocessedKeys and retry any leftovers.
const command = new BatchGetCommand({
  RequestItems: {
    Users: {
      Keys: userIds.map(id => ({ userId: id }))
    }
  }
});
const response = await docClient.send(command);

General Optimization Techniques

Connection Pooling

// MongoDB connection pool
// The driver keeps up to 50 sockets open (10 warm), avoiding a
// TCP + auth handshake per operation.
const client = new MongoClient(uri, {
  maxPoolSize: 50,
  minPoolSize: 10
});

// Cassandra connection pool
// Connection counts are tuned per host "distance": more connections to
// local-datacenter nodes than to remote ones.
const client = new cassandra.Client({
  contactPoints: ['node1', 'node2'],
  pooling: {
    coreConnectionsPerHost: {
      [cassandra.types.distance.local]: 2,
      [cassandra.types.distance.remote]: 1
    }
  }
});

Caching

// Cache frequently accessed data
class UserService {
  async getUser(userId) {
    // Try cache first
    const cached = await redis.get(`user:${userId}`);
    if (cached) {
      return JSON.parse(cached);
    }
    
    // Load from database
    const user = await db.collection('users').findOne({ _id: userId });
    
    // Cache for 5 minutes
    await redis.setex(`user:${userId}`, 300, JSON.stringify(user));
    
    return user;
  }
}

Denormalization

// ❌ Bad - Multiple queries
// Two round trips to assemble one view of the user.
const user = await db.users.findOne({ _id: userId });
const orders = await db.orders.find({ userId }).toArray();

// ✅ Good - Embedded data
// Denormalize: keep the recently needed orders inside the user document.
// Trade-off: writes must keep the embedded copy in sync.
const user = await db.users.findOne({ _id: userId });
// user.recentOrders already embedded

Monitoring and Profiling

// MongoDB profiler
db.setProfilingLevel(2);  // Log all operations
// Profile entries land in the capped system.profile collection.
db.system.profile.find().sort({ ts: -1 }).limit(10);

// Slow query log
// Level 2 is expensive; level 1 with a slowms threshold is the usual
// production setting.
db.setProfilingLevel(1, { slowms: 100 });  // Log queries > 100ms

// .NET with monitoring: subscribe to driver command events to log every
// command and its duration.
public class MongoService
{
    private readonly IMongoClient _client;

    /// <summary>
    /// Creates a client whose commands are logged as they start and succeed.
    /// </summary>
    /// <param name="connectionString">MongoDB connection string.</param>
    // Fix: the original read an undefined `connectionString`; it must be
    // supplied by the caller.
    public MongoService(string connectionString)
    {
        var settings = MongoClientSettings.FromConnectionString(connectionString);

        settings.ClusterConfigurator = cb =>
        {
            cb.Subscribe<CommandStartedEvent>(e =>
            {
                Console.WriteLine($"Command: {e.CommandName}");
                Console.WriteLine($"Query: {e.Command}");
            });

            cb.Subscribe<CommandSucceededEvent>(e =>
            {
                Console.WriteLine($"Duration: {e.Duration}");
            });
        };

        _client = new MongoClient(settings);
    }
}

Performance Checklist

/**
 * Quick-reference checklist summarizing the optimization areas covered
 * above, grouped by topic.
 */
const optimizationChecklist = {
  indexes: [
    'Create indexes on frequently queried fields',
    'Use compound indexes for multiple fields',
    'Drop unused indexes',
    'Monitor index usage'
  ],
  queries: [
    'Use projection to limit fields',
    'Limit result sets',
    'Use covered queries when possible',
    'Avoid scatter-gather queries'
  ],
  aggregation: [
    'Filter early with $match',
    'Limit fields with $project',
    'Use indexes for $match and $sort',
    'Use $limit to reduce processing'
  ],
  general: [
    'Use connection pooling',
    'Implement caching layer',
    'Denormalize when appropriate',
    'Monitor slow queries',
    'Use batch operations'
  ]
};

Interview Tips

  • Explain indexes: Critical for query performance
  • Show explain(): Analyze query execution
  • Demonstrate covered queries: No document fetch
  • Discuss projection: Limit returned fields
  • Mention caching: Redis for frequently accessed data
  • Show examples: MongoDB, Cassandra, DynamoDB

Summary

Optimize NoSQL queries with proper indexing, covered queries, projection, and result limiting. Use explain() to analyze performance. Filter early in aggregation pipelines. Query by partition/shard keys. Avoid full table scans. Use batch operations. Implement caching layer. Monitor slow queries. Denormalize when appropriate. Connection pooling for efficiency. Essential for production NoSQL performance.

Test Your Knowledge

Take a quick quiz to test your understanding of this topic.

Test Your NoSQL Knowledge

Ready to put your skills to the test? Take our interactive NoSQL quiz and get instant feedback on your answers.