System Design Interview: Design Dropbox / Google Drive (Cloud File Storage)
Cloud file storage like Dropbox or Google Drive is a popular system design question testing file chunking, sync protocols, conflict resolution, and large-scale object storage. Commonly asked at Dropbox, Google, Microsoft (OneDrive), Box, and Apple (iCloud).
Requirements Clarification
Functional Requirements
- Upload, download, delete files and folders
- Sync files across multiple devices automatically
- Share files/folders with other users (view, edit permissions)
- Version history: restore previous file versions
- Offline access: changes sync when device comes online
- Collaboration: multiple users editing simultaneously (simplified — full Google Docs-style real-time co-editing is out of scope)
Non-Functional Requirements
- Scale: 500M users, 50 PB total storage, 10M concurrent connected devices
- File sizes: small (KB) to large (GB)
- Sync latency: changes visible on other devices within 5 seconds
- Bandwidth efficiency: only upload changed portions of files (delta sync)
- High durability: 99.999999999% (eleven nines) — achieved via replication or erasure coding across multiple availability zones (simple triple redundancy alone does not reach eleven nines)
High-Level Architecture
Desktop/Mobile Client
↓
Upload Service → Block Storage (S3/GCS)
Metadata Service → PostgreSQL (file tree, versions)
Sync Service → WebSocket connections
Notification Service → Push/WebSocket
↓
CDN (for downloads of popular files)
Core Innovation: Block-Level Deduplication
import hashlib
from typing import Optional
BLOCK_SIZE = 4 * 1024 * 1024  # 4 MiB fixed-size chunks

def split_file_into_blocks(file_path: str) -> list[dict]:
    """
    Chunk a file into fixed-size blocks for content-addressed storage.

    Each block is identified by the SHA-256 of its bytes, so identical
    content hashes identically — the basis for deduplicated uploads.

    Returns a list of dicts with keys 'hash', 'size', and 'data'
    (the raw bytes, kept only until the upload check runs).
    """
    blocks: list[dict] = []
    with open(file_path, 'rb') as fh:
        # read() returns b'' at EOF, which ends the walrus loop
        while chunk := fh.read(BLOCK_SIZE):
            blocks.append({
                'hash': hashlib.sha256(chunk).hexdigest(),
                'size': len(chunk),
                'data': chunk,  # Only included before upload check
            })
    return blocks
class BlockStore:
    """
    Content-addressed block storage on top of an S3-compatible object store.

    A block is stored under a key derived from its SHA-256 hash, so
    identical content maps to the same object: if the key exists, the
    block is already stored. This gives global deduplication across users.
    """

    def __init__(self, s3_client, bucket: str):
        # s3_client is assumed to be a boto3-style S3 client — verify at wiring
        self.s3 = s3_client
        self.bucket = bucket

    def _key(self, block_hash: str) -> str:
        """Object key for a block; the 2-char prefix shards the keyspace."""
        return f"blocks/{block_hash[:2]}/{block_hash}"

    def upload_block(self, block_hash: str, data: bytes) -> bool:
        """
        Upload a block unless it is already stored (deduplication).

        Returns True if the block was uploaded, False if it was a dedup hit.
        """
        if self.block_exists(block_hash):
            return False  # Already stored, skip upload
        self.s3.put_object(
            Bucket=self.bucket,
            Key=self._key(block_hash),
            Body=data,
            ServerSideEncryption='AES256',
        )
        return True

    def block_exists(self, block_hash: str) -> bool:
        """
        HEAD the block's key; only a 404/NotFound means the block is absent.

        FIX: the original caught every ClientError (throttling, access
        denied, transient 5xx) and reported the block as missing, which
        would mask real failures and trigger bogus re-uploads. Non-404
        errors now propagate to the caller.
        """
        try:
            self.s3.head_object(Bucket=self.bucket, Key=self._key(block_hash))
            return True
        except self.s3.exceptions.ClientError as err:
            code = getattr(err, 'response', {}).get('Error', {}).get('Code', '')
            if code in ('404', 'NoSuchKey', 'NotFound'):
                return False
            raise  # permission/throttling/server errors are not "missing"

    def download_block(self, block_hash: str) -> bytes:
        """Fetch a block's raw bytes by its content hash."""
        response = self.s3.get_object(
            Bucket=self.bucket,
            Key=self._key(block_hash),
        )
        return response['Body'].read()
File Metadata Model
-- File tree: one row per file or folder, per owner.
CREATE TABLE files (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
owner_id UUID NOT NULL,
path TEXT NOT NULL, -- full path, e.g. /Documents/report.pdf
name VARCHAR(255) NOT NULL,
size_bytes BIGINT NOT NULL DEFAULT 0, -- 0 for folders
mime_type VARCHAR(100),
is_folder BOOLEAN NOT NULL DEFAULT FALSE,
parent_id UUID REFERENCES files(id), -- NULL for the root
created_at TIMESTAMPTZ DEFAULT NOW(),
modified_at TIMESTAMPTZ DEFAULT NOW(),
is_deleted BOOLEAN DEFAULT FALSE, -- soft delete: row kept for version history
-- NOTE(review): this constraint includes soft-deleted rows, so recreating a
-- file at the path of a deleted one will violate it — probably needs a
-- partial unique index WHERE NOT is_deleted. Confirm intended semantics.
UNIQUE(owner_id, path)
);
-- Immutable version snapshots: content is the ordered block-hash list,
-- so versions share unchanged blocks (delta storage for free).
CREATE TABLE file_versions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
file_id UUID NOT NULL REFERENCES files(id),
version_number INT NOT NULL, -- monotonically increasing per file
size_bytes BIGINT NOT NULL,
block_hashes TEXT[] NOT NULL, -- ordered list of block hashes
created_at TIMESTAMPTZ DEFAULT NOW(),
created_by UUID NOT NULL, -- which device/user created this version
UNIQUE(file_id, version_number) -- no duplicate version numbers per file
);
-- Sharing grants; a row per (file, grantee) with an optional expiry.
CREATE TABLE file_shares (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
file_id UUID NOT NULL REFERENCES files(id),
shared_with UUID NOT NULL, -- user_id of the grantee
permission VARCHAR(10) NOT NULL, -- READ, WRITE, ADMIN
created_at TIMESTAMPTZ DEFAULT NOW(),
expires_at TIMESTAMPTZ -- NULL = never expires
);
-- Index for efficient sync queries ("what changed for this user since T?")
CREATE INDEX idx_files_owner_modified ON files(owner_id, modified_at DESC);
-- Index for fetching the latest version of a file quickly
CREATE INDEX idx_versions_file ON file_versions(file_id, version_number DESC);
Upload Flow: Delta Sync
class DropboxClient:
    """
    Client-side upload and sync logic.

    The key bandwidth optimization: before uploading, ask the server
    which block hashes it already stores and transfer only the missing
    blocks (delta sync + deduplication).
    """

    def __init__(self, server_api, block_store):
        self.api = server_api
        self.block_store = block_store
        self.local_state = LocalSyncState()  # tracks last-known server state

    def upload_file(self, local_path: str, remote_path: str):
        """Upload a file, sending only blocks the server does not have."""
        # Chunk the file and collect the content hashes.
        file_blocks = split_file_into_blocks(local_path)
        all_hashes = [blk['hash'] for blk in file_blocks]

        # The server replies with the subset of hashes NOT in its storage.
        needed = self.api.check_blocks(all_hashes)

        # Transfer only the blocks the server is missing.
        for blk in file_blocks:
            if blk['hash'] in needed:
                self.block_store.upload_block(blk['hash'], blk['data'])

        # Commit the metadata (ordered block list + total size) as a version.
        total_size = sum(blk['size'] for blk in file_blocks)
        version = self.api.commit_file(
            path=remote_path,
            block_hashes=all_hashes,
            size=total_size,
        )
        print(f"Uploaded {remote_path} as version {version}")

    def check_blocks_client_side(self, blocks: list[dict]) -> dict:
        """
        Batch existence pre-check for large files, done before uploading.
        Returns {hash: exists} for every block.
        """
        return self.api.batch_check_blocks([blk['hash'] for blk in blocks])
class SyncService:
    """
    Server-side sync orchestration.

    Pushes change notifications to connected devices (WebSocket, with
    long-polling as fallback) and answers pull-based "changes since
    cursor" queries for devices reconnecting after being offline.
    """

    def __init__(self, db, notification_service):
        self.db = db
        self.notifier = notification_service
        self.connected_devices: dict[str, str] = {}  # device_id -> websocket

    def commit_file_upload(self, user_id: str, path: str,
                           block_hashes: list[str], size: int) -> int:
        """
        Commit a completed file upload:
        1. Create/update the file record.
        2. Create the next version row.
        3. Notify the user's other devices — after the transaction commits.

        Returns the new version number.
        """
        with self.db.transaction():
            # Upsert file record
            file_id = self.db.upsert_file(user_id, path, size)
            # Create new version
            current_version = self.db.get_latest_version(file_id)
            new_version = (current_version or 0) + 1
            self.db.create_version(file_id, new_version, block_hashes, size, user_id)
        # FIX: the original ran this inside the transaction despite its own
        # "outside transaction" comment — a slow or failing push must not
        # hold DB locks or announce a version that later rolls back.
        self.notifier.notify_user_devices(user_id, {
            'type': 'FILE_CHANGED',
            'path': path,
            'version': new_version,
            'size': size,
        }, exclude_device=None)  # Notify all devices
        return new_version

    def get_changes_since(self, user_id: str, cursor: str) -> dict:
        """
        Pull-based sync: the client sends its cursor (last sync timestamp);
        the server returns up to 1000 changes since then plus a new cursor.
        Used for: reconnecting after offline, initial sync on a new device.

        FIX: return annotation corrected from list[dict] to dict — the
        method returns {'changes': [...], 'cursor': str}.
        """
        changes = self.db.fetch("""
            SELECT f.path, fv.version_number, fv.block_hashes, fv.size_bytes,
                   fv.created_at, f.is_deleted
            FROM file_versions fv
            JOIN files f ON f.id = fv.file_id
            WHERE f.owner_id = $1
              AND fv.created_at > $2
            ORDER BY fv.created_at ASC
            LIMIT 1000
        """, user_id, cursor)
        # Advance the cursor to the newest change seen; unchanged if none.
        new_cursor = changes[-1]['created_at'].isoformat() if changes else cursor
        return {'changes': changes, 'cursor': new_cursor}
Conflict Resolution
def resolve_conflict(client_version: dict, server_version: dict) -> str:
    """
    Resolve a concurrent-edit conflict between client and server copies.

    Dropbox-style strategy: last-writer-wins, with the losing version
    preserved as a "conflict copy" so no user data is silently discarded.
    (Google Drive instead uses Operational Transform / CRDTs for
    real-time collaborative editing.)

    Returns 'CLIENT_WINS' or 'SERVER_WINS'.
    """
    # Simple strategy: keep both versions — the loser becomes a copy.
    if client_version['modified_at'] > server_version['modified_at']:
        # Client has newer changes — upload as new version;
        # preserve the server version as a conflict copy.
        conflict_name = f"{server_version['name']} (conflict from Device B)"
        create_conflict_copy(server_version, conflict_name)
        return 'CLIENT_WINS'
    else:
        # Server is newer — client downloads the server version.
        # FIX: the original computed conflict_name here but never created
        # the copy, silently dropping the client's offline edits.
        conflict_name = f"{client_version['name']} (conflict copy)"
        create_conflict_copy(client_version, conflict_name)
        return 'SERVER_WINS'
Key Design Decisions
- Block size 4-8MB: Larger blocks = fewer requests but more wasted upload when file changes. Dropbox uses variable-size blocks (Rabin fingerprinting for content-defined chunking) to handle insertions/deletions better than fixed-size blocks.
- Content-addressed storage: Block hash = block identity. Same content = same storage. Global deduplication: if 1M users upload the same popular video, only 1 copy stored. Estimate: 30-50% storage savings from deduplication.
- Notification via WebSocket for push, long-poll as fallback: When file changes, server pushes to connected devices immediately. For mobile (battery/network constraints): use push notifications via APNS/FCM.
- Version limits: Dropbox keeps 30 days of version history (Plus: 180 days). Implement as TTL-based cleanup job that deletes old file_versions records and their blocks (if no other version references them).