Requirements
Functional: send notifications via multiple channels (push, email, SMS, in-app), support notification templates with variable substitution, allow users to set preferences (opt-in/opt-out per channel per category), batch notifications (digest), deduplicate (don’t send the same alert twice), track delivery status.
Non-functional: at-least-once delivery, async (never block the triggering action), channel fallback (if push fails, try email), scalable to millions of notifications/day.
Core Entities
from enum import Enum
from dataclasses import dataclass, field
from typing import Optional, List, Dict
from datetime import datetime
class Channel(Enum):
    """Delivery channels a notification can be routed through."""
    PUSH = "PUSH"    # mobile push (delivered via FCM — see PushHandler)
    EMAIL = "EMAIL"  # email (delivered via SendGrid — see EmailHandler)
    SMS = "SMS"      # text message — no handler shown in this file
    INAPP = "INAPP"  # in-app inbox/feed notification
class NotificationType(Enum):
    """Business categories of notifications; users opt in/out per type."""
    ORDER_CONFIRMED = "ORDER_CONFIRMED"
    PAYMENT_FAILED = "PAYMENT_FAILED"
    MESSAGE_RECEIVED = "MESSAGE_RECEIVED"
    PRICE_DROP = "PRICE_DROP"            # digest-able (see DigestService)
    ACCOUNT_SECURITY = "ACCOUNT_SECURITY"  # mandatory — cannot be opted out of
@dataclass
class NotificationTemplate:
    """Message template for one (notification_type, channel) pair."""
    template_id: str
    notification_type: NotificationType
    channel: Channel
    subject: str   # for email
    body: str      # supports {{variable}} placeholders
    priority: int  # 1=critical, 2=high, 3=normal, 4=low
@dataclass
class UserPreferences:
    """Per-user opt-in matrix plus an optional do-not-disturb window."""
    user_id: str
    # channel -> list of types the user wants on that channel
    subscriptions: Dict[Channel, List[NotificationType]] = field(default_factory=dict)
    dnd_start: Optional[int] = None  # hour 0-23: do not disturb window start
    dnd_end: Optional[int] = None    # hour 0-23: do not disturb window end
@dataclass
class NotificationRequest:
    """A single triggering event asking the service to notify one user."""
    request_id: str
    user_id: str
    notification_type: NotificationType
    variables: Dict[str, str]  # template variable values for {{...}} substitution
    idempotency_key: str       # prevent duplicate sends of the same event
    priority: int = 3          # 1=critical .. 4=low (see NotificationTemplate)
    scheduled_at: Optional[datetime] = None  # None = send immediately
@dataclass
class NotificationRecord:
    """Delivery-tracking row: one per (request, channel) attempt."""
    record_id: str
    request_id: str
    user_id: str
    channel: Channel
    notification_type: NotificationType
    status: str  # 'PENDING' | 'SENT' | 'DELIVERED' | 'FAILED' | 'SKIPPED'
    sent_at: Optional[datetime] = None
    error: Optional[str] = None        # populated when status is FAILED/SKIPPED
    external_id: Optional[str] = None  # FCM message ID, SendGrid ID, etc.
Notification Service Architecture
class NotificationService:
    """Fan-out entry point for a single notification request.

    Resolves target channels from user preferences, renders the
    per-channel template, persists a PENDING record, and hands the
    payload to a channel-specific queue for asynchronous delivery.
    """

    def send(self, request: NotificationRequest) -> List[NotificationRecord]:
        """Create and enqueue one PENDING record per selected channel."""
        # Idempotency: a replayed request returns the records created
        # by the first attempt instead of sending again.
        if db.notification_exists(request.idempotency_key):
            return db.get_records_by_idempotency(request.idempotency_key)

        prefs = db.get_user_preferences(request.user_id)
        created: List[NotificationRecord] = []
        for ch in self._select_channels(request, prefs):
            tmpl = db.get_template(request.notification_type, ch)
            if not tmpl:
                # No template configured for this (type, channel) pair.
                continue
            rendered = self._render(tmpl.body, request.variables)
            rec = NotificationRecord(
                record_id=generate_id(),
                request_id=request.request_id,
                user_id=request.user_id,
                channel=ch,
                notification_type=request.notification_type,
                status='PENDING',
            )
            db.save(rec)
            # Delivery itself is async — never blocks the caller.
            self._enqueue(ch, rec, rendered, tmpl.subject)
            created.append(rec)
        return created

    def _select_channels(self, request, prefs: UserPreferences) -> List[Channel]:
        """Pick delivery channels for this request from user preferences."""
        # ACCOUNT_SECURITY bypasses preferences entirely (cannot opt out).
        if request.notification_type == NotificationType.ACCOUNT_SECURITY:
            return [Channel.PUSH, Channel.EMAIL]
        candidates = (Channel.PUSH, Channel.EMAIL, Channel.INAPP)
        chosen = [
            ch for ch in candidates
            if request.notification_type in prefs.subscriptions.get(ch, [])
        ]
        # Guarantee at least the in-app channel so nothing is dropped.
        return chosen if chosen else [Channel.INAPP]

    def _render(self, template: str, variables: dict) -> str:
        """Substitute every {{key}} placeholder with its value."""
        rendered = template
        for name, val in variables.items():
            rendered = rendered.replace('{{' + name + '}}', val)
        return rendered
Channel Handlers
class PushHandler:
    """Delivers a rendered notification to all of a user's devices via FCM."""

    def send(self, user_id: str, body: str, record: NotificationRecord):
        """Push `body` to every registered device token for `user_id`.

        Marks the record SENT if at least one device accepted the
        message, FAILED (with the last error) if every attempt failed,
        and SKIPPED if the user has no registered tokens.
        """
        device_tokens = db.get_device_tokens(user_id)
        if not device_tokens:
            record.status = 'SKIPPED'
            record.error = 'No device tokens'
            db.save(record)
            return
        any_sent = False
        last_error = None
        for token in device_tokens:
            try:
                response = fcm_client.send({
                    'token': token,
                    'notification': {'title': 'Notification', 'body': body},
                })
                any_sent = True
                # Keep the most recent provider message id for tracking.
                record.external_id = response.message_id
            except Exception as e:
                # FCM reports permanently-dead tokens as UNREGISTERED;
                # prune them so we stop retrying stale devices.
                if 'UNREGISTERED' in str(e):
                    db.remove_device_token(token)
                last_error = str(e)
        # Bug fix: previously status/error reflected only the LAST token's
        # outcome, so one failing token could flip a delivered push to
        # FAILED (or a final success could mask earlier failures' errors).
        if any_sent:
            record.status = 'SENT'
        else:
            record.status = 'FAILED'
            record.error = last_error
        db.save(record)
class EmailHandler:
    """Delivers a rendered notification as an email via SendGrid."""

    def send(self, user_id: str, subject: str, body: str, record: NotificationRecord):
        """Email `body` to the user's verified address and update `record`.

        SKIPPED when the address is unverified; FAILED (with the error
        captured) when the provider call raises; SENT otherwise.
        """
        user = db.get_user(user_id)
        if not user.email_verified:
            # Never email unverified addresses (abuse/deliverability risk).
            record.status = 'SKIPPED'
            record.error = 'Email address not verified'
            db.save(record)
            return
        try:
            response = sendgrid_client.send(
                to=user.email, subject=subject, html_body=body
            )
        except Exception as e:
            # Bug fix: the original let provider exceptions escape, leaving
            # the record stuck in PENDING with no error ever persisted.
            record.status = 'FAILED'
            record.error = str(e)
            db.save(record)
            return
        record.status = 'SENT'
        record.external_id = response.headers.get('X-Message-Id')
        db.save(record)
Digest / Batching
class DigestService:
    """Batch low-priority notifications into a daily digest."""

    # Notification types that may be deferred into a digest rather than
    # sent immediately.
    DIGEST_TYPES = {NotificationType.PRICE_DROP}

    def queue_for_digest(self, request: NotificationRequest):
        """Queue a request for inclusion in the user's next digest.

        Bug fix: DIGEST_TYPES was declared but never consulted, so any
        notification type (including time-critical ones) could be
        silently deferred. Non-digest-able types are now rejected.
        """
        if request.notification_type not in self.DIGEST_TYPES:
            raise ValueError(
                f'{request.notification_type} is not a digest-able type'
            )
        db.add_to_digest_queue(request.user_id, request)

    def send_daily_digest(self, user_id: str):
        """Render and send one digest email covering all pending items."""
        pending = db.get_digest_queue(user_id)
        if not pending:
            return
        # Group pending requests by notification type for the template.
        grouped = {}
        for req in pending:
            grouped.setdefault(req.notification_type.value, []).append(req)
        body = self._render_digest(grouped)
        self._send_email(user_id, 'Your Daily Digest', body)
        # Clear only AFTER the send call: a crash before this line causes a
        # re-send rather than a drop (at-least-once delivery).
        db.clear_digest_queue(user_id)
Deduplication
DEDUP_WINDOW_SECONDS = 3600  # 1 hour


def is_duplicate(user_id: str, notification_type: str, dedup_key: str) -> bool:
    """Atomically record this alert and report whether it was already seen.

    Uses Redis SET with NX + TTL: the first caller inside the window
    creates the key (not a duplicate); any later caller finds it there.
    """
    key = f"notif_dedup:{user_id}:{notification_type}:{dedup_key}"
    created = r.set(key, 1, ex=DEDUP_WINDOW_SECONDS, nx=True)
    # redis-py returns None when NX suppressed the write → key existed.
    return created is None
Interview Questions
Q: How do you handle a user who has push notifications but is currently offline?
FCM (Firebase Cloud Messaging) queues push notifications when a device is offline and delivers them when the device reconnects (up to 4 weeks). The notification service sends to FCM regardless of online status — FCM handles the delivery. FCM returns a success even if the device is offline (it accepts the message). The device receives it when it comes online. For time-sensitive notifications (flash sale ends in 1 hour), set a TTL on the FCM message — if undelivered within TTL, discard rather than showing stale content.
Q: How would you scale this to 100 million notifications per day?
Use Kafka as the message bus — one topic per channel (push-notifications, email-notifications, sms-notifications). Producers (notification service) publish to Kafka. Consumers (channel handlers) are horizontally scaled worker pools. Each channel has different throughput: push = millions/minute (FCM handles batching), email = rate-limited by ESP (e.g., SendGrid = 100K/hour on basic plan → use multiple API keys or premium tier), SMS = expensive, rate-limited (use only for critical). Priority queues: critical notifications (ACCOUNT_SECURITY) get their own Kafka partition processed first.
{
  "@context": "https://schema.org",
  "@type": "FAQPage",
  "mainEntity": [
    {
      "@type": "Question",
      "name": "How do you design a notification service that supports push, email, and SMS?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Use a channel abstraction: define a NotificationChannel interface with a send(user_id, template, data) method. Implement PushChannel (FCM/APNs), EmailChannel (SendGrid/SES), SMSChannel (Twilio). A NotificationRouter maps notification_type to the list of channels to use. User preferences table stores per-user opt-in/opt-out per channel and notification type. The entry point is a Notification event on Kafka. The Notification Service consumes the event, loads user preferences, filters to opted-in channels, renders the template for each channel, and dispatches to each channel handler. Channel handlers are independent — push failure does not block email delivery."
      }
    },
    {
      "@type": "Question",
      "name": "How do you prevent duplicate notifications from being sent?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Use a deduplication key: hash(user_id + notification_type + reference_id + channel). Before sending, SET NX (Redis) with this key and a TTL matching your deduplication window (e.g., 24 hours for transactional, 1 hour for digest). If the SET returns 0 (key exists), skip this notification — it was already sent. If 1, proceed with delivery. Store the deduplication key in the notifications_sent table for audit trails. For idempotent retry on delivery failure (FCM returned 5xx), re-enqueue with the same deduplication key — the next attempt finds the key only if the send actually succeeded, so mark dedup AFTER confirmed delivery."
      }
    },
    {
      "@type": "Question",
      "name": "How do you implement notification templates with variable substitution?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Store templates in a database keyed by (notification_type, channel, locale). Template body uses {{variable}} placeholders: Hello {{first_name}}, your order {{order_id}} has shipped. Template rendering: replace all {{key}} with values from the data payload using a regex or string replacement. Use Handlebars for richer templates (conditionals, loops). Versioning: templates have a version field; the notification event can specify which version to use (useful for A/B testing). Cache rendered templates in Redis for identical payloads (same type + same data hash). Store the rendered content on the notification record for debugging and auditing."
      }
    },
    {
      "@type": "Question",
      "name": "How do you implement digest notifications to avoid alert fatigue?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Instead of sending an email per event, aggregate events into a digest. Store events in a pending_notifications table with (user_id, notification_type, payload, created_at). A scheduled job runs every 15 minutes: SELECT all pending notifications per user, group by type, render a digest template with all items, send one email/push, mark rows as sent. Digest windows: some notifications are immediate (security alerts, OTP), others are digest-able (weekly summary, comment notifications). Store delivery_mode=IMMEDIATE or DIGEST on the notification type config. Users can configure their digest frequency (real-time, hourly, daily) in preferences."
      }
    },
    {
      "@type": "Question",
      "name": "How do you scale a notification service to send 10 million notifications per hour?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "10M/hour = 2800/second. Fan-out the work: Kafka partitioned by user_id (keeps ordering per user). Multiple consumer groups — one per channel. Each channel's consumer pool auto-scales independently: push volume >> email volume. For FCM push: use batch send API (up to 500 tokens per request), reducing API calls by 500x. For email: use SendGrid batch API (1000 per request). Rate limit per channel: FCM allows 600K messages/minute per project. Separate high-priority queue (OTP, payment alerts) from low-priority (marketing). Cache user device tokens and preferences in Redis — avoid DB read per notification. Dead letter queue for failed deliveries with exponential backoff retry."
      }
    }
  ]
}
Asked at: Snap Interview Guide
Asked at: DoorDash Interview Guide
Asked at: Shopify Interview Guide
Asked at: Stripe Interview Guide