# ezpz

Launch, train, and communicate across all your accelerators, `ezpz`.

## 📦 Install

```bash
python3 -m pip install -e "git+https://github.com/saforem2/ezpz#egg=ezpz" --require-virtualenv
```
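After installing, a quick sanity check — a minimal sketch using the API documented below (assumes a working `torch` install):

```python
# Confirm ezpz imports and can detect an accelerator:
import ezpz as ez

print(ez.get_torch_device())  # e.g. 'cuda', 'xpu', 'mps', or 'cpu'
```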
## 🧩 Features

`ezpz` simplifies the process of:
- Setting up + launching distributed training:

  - `pytorch` + `DDP`:

    ```python
    import ezpz as ez
    RANK = ez.setup_torch(backend='DDP')
    ```

  - `pytorch` + `DeepSpeed`:

    ```python
    import ezpz as ez
    RANK = ez.setup_torch(backend='deepspeed')
    ```

  - When launched with an appropriate {`mpiexec`, `srun`, `launch`} command, this will automatically set up all accelerators available in your job:

    - `import ezpz as ez`
    - `RANK = ez.setup_torch(backend=backend)` for `backend` ∈ {`DDP`, `deepspeed`, `horovod`}
    - once set up:
      - `RANK = ez.get_rank()`
      - `LOCAL_RANK = ez.get_local_rank()`
      - `WORLD_SIZE = ez.get_world_size()`

    (see [`ezpz/dist.py`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/dist.py) for more details; a combined sketch follows below)
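Putting those pieces together — a minimal sketch that uses only the calls listed above:

```python
import ezpz as ez

# Initialize distributed training with the requested backend:
RANK = ez.setup_torch(backend='DDP')  # or 'deepspeed', 'horovod'

# Once set up, query the distributed environment:
WORLD_SIZE = ez.get_world_size()  # total number of processes
LOCAL_RANK = ez.get_local_rank()  # rank within the current node
print(f'Hello from {RANK=} / {WORLD_SIZE=} ({LOCAL_RANK=})')
```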
- Using your favorite framework:

  - `framework=pytorch` + `backend={DDP, deepspeed, horovod}`
  - `framework=tensorflow` + `backend=horovod`
  - `ez.get_torch_device()`: {`cuda`, `xpu`, `mps`, `cpu`}
  - `ez.get_torch_backend()`: {`nccl`, `ccl`, `gloo`}

  (see frameworks for additional details; a short sketch follows below)
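As the example logs later in this document show, the torch device and the distributed backend come in matched pairs (e.g. `xpu` with `ccl`, `cpu` with `gloo`); a small sketch of querying both:

```python
import ezpz as ez

device = ez.get_torch_device()    # one of: 'cuda', 'xpu', 'mps', 'cpu'
backend = ez.get_torch_backend()  # one of: 'nccl', 'ccl', 'gloo'
# e.g. 'cuda' pairs with 'nccl', 'xpu' with 'ccl', and 'cpu' with 'gloo'
print(f'{device=}, {backend=}')
```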
- Writing device-agnostic code:

  - `ezpz.get_torch_device()`:

    ```python
    >>> import torch
    >>> import ezpz as ez
    >>> DEVICE = ez.get_torch_device()
    >>> model = torch.nn.Linear(10, 10)
    >>> model.to(DEVICE)
    >>> x = torch.randn((10, 10), device=DEVICE)
    >>> y = model(x)
    >>> y.device
    device(type='mps', index=0)
    ```
- Using [`wandb`](https://wandb.ai):

  - [`ez.setup_wandb(project_name='ezpz')`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/dist.py#L735)
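For example — a minimal sketch (assuming `wandb` is installed and you are logged in):

```python
import ezpz as ez

RANK = ez.setup_torch(backend='DDP')
if RANK == 0:  # only initialize wandb on the main process
    ez.setup_wandb(project_name='ezpz')
    # ...then log metrics from rank 0 as usual, e.g.:
    # import wandb; wandb.log({'loss': loss.item()})
```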
- Full support for any {`device` + `framework` + `backend`}:

  - device: {`GPU`, `XPU`, `MPS`, `CPU`}
  - framework: {`torch`, `deepspeed`, `horovod`, `tensorflow`}
  - backend: {`DDP`, `deepspeed`, `horovod`}
## 🚀 Launch

> [!IMPORTANT]
> We walk through a complete example below that will:
>
> - Install `ezpz`
> - Launch [`test_dist.py`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/test_dist.py) across all the GPUs in your active {`PBS`, `slurm`} job

[`test_dist.py`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/test_dist.py):
```python
import os
import logging
import time
from typing import Optional

import torch

import ezpz as ez

# backend can be any of DDP, deepspeed, horovod
RANK = ez.setup_torch(
    backend=(
        backend := os.environ.get('BACKEND', 'DDP')
    ),
    port=(
        port := os.environ.get("MASTER_PORT", "29500")
    )
)
# RANK = DIST_INIT['rank']
# WORLD_SIZE = DIST_INIT['world_size']
# LOCAL_RANK = DIST_INIT['local_rank']
# if DEVICE == "cuda" and torch.cuda.is_available():
#     torch.cuda.set_device(LOCAL_RANK)
DEVICE = ez.get_torch_device()
WORLD_SIZE = ez.get_world_size()
LOCAL_RANK = ez.get_local_rank()
DEVICE_ID = f"{DEVICE}:{LOCAL_RANK}"


# log only from RANK == 0
logger = logging.getLogger(__name__)
logger.setLevel("INFO") if RANK == 0 else logger.setLevel("CRITICAL")

BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 64))
INPUT_SIZE = int(os.environ.get("INPUT_SIZE", 128))
OUTPUT_SIZE = int(os.environ.get("OUTPUT_SIZE", 128))
# resolve e.g. DTYPE=float32 to torch.float32 (falls back to the default dtype)
DTYPE = (
    getattr(torch, dtype_str)
    if (dtype_str := os.environ.get("DTYPE")) is not None
    else torch.get_default_dtype()
)
TRAIN_ITERS = int(os.environ.get("TRAIN_ITERS", 50))


class Network(torch.nn.Module):
    def __init__(
        self,
        input_dim: int = 128,
        output_dim: int = 128,
        sizes: Optional[list[int]] = None,
    ):
        super(Network, self).__init__()
        if sizes is None:
            self.layers = torch.nn.Linear(input_dim, output_dim)
        elif len(sizes) > 0:
            layers = [torch.nn.Linear(input_dim, sizes[0])]
            for idx, size in enumerate(sizes[1:]):
                layers.append(
                    torch.nn.Linear(sizes[idx], size)
                )
            layers.append(torch.nn.Linear(sizes[-1], output_dim))
            self.layers = torch.nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)


def calc_loss(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return (y - x).pow(2).sum()


def plot_losses(losses: dict) -> None:
    import plotext as pltx
    pltx.theme('clear')
    pltx.scatter(list(losses.values()))
    pltx.ylabel("loss")
    pltx.xlabel("iteration")
    pltx.show()
    pltx.save_fig("test_dist_losses.txt")


def main():
    model = Network(
        input_dim=INPUT_SIZE,
        output_dim=OUTPUT_SIZE,
        sizes=[1024, 512, 256, 128]
    )
    model.to(DEVICE)
    model.to(DEVICE_ID)
    logger.info(f'{model=}')
    optimizer = torch.optim.Adam(model.parameters())
    if backend.lower() == 'ddp':
        if WORLD_SIZE > 1:
            from torch.nn.parallel import DistributedDataParallel as DDP
            model = DDP(
                model,
                device_ids=[]
            )
    elif backend.lower() in ('ds', 'deepspeed'):
        import argparse
        import deepspeed
        # config = ez.load_ds_config().update(
        #     {"train_micro_batch_size_per_gpu": BATCH_SIZE}
        # )
        parser = argparse.ArgumentParser(
            description='My training script.'
        )
        parser.add_argument(
            '--local_rank',
            required=False,
            type=int,
            default=-1,
            # default=ez.get_local_rank()),
            help='local rank passed from distributed launcher',
        )
        # Include DeepSpeed configuration arguments
        parser = deepspeed.add_config_arguments(parser)
        cmd_args = parser.parse_args()
        logger.info(f'{cmd_args=}')
        model, optimizer, *_ = deepspeed.initialize(
            args=cmd_args,
            model=model,
            optimizer=optimizer,
        )

    losses = {}
    for iter in range(TRAIN_ITERS):
        t0 = time.perf_counter()
        x = torch.rand((BATCH_SIZE, INPUT_SIZE), dtype=DTYPE).to(DEVICE)
        y = model(x)
        loss = calc_loss(x, y)
        # store a plain float (plotext can't plot device tensors)
        losses[iter] = loss.item()
        dtf = ((t1 := time.perf_counter()) - t0)
        if backend.lower() in ('ds', 'deepspeed'):
            model.backward(loss)
            model.step()
        else:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        dtb = time.perf_counter() - t1
        logger.info(
            ', '.join([
                f'{iter=}',
                f'loss={loss.item():.5f}',
                f'dt={dtf+dtb:.3f}',
                f'{dtf=:.3f}',
                f'{dtb=:.3f}'
            ])
        )
    if RANK == 0:
        plot_losses(losses)


if __name__ == '__main__':
    main()
```
## 🏃🏻‍♀️ Running

- Install:

  ```bash
  $ git clone https://github.com/saforem2/ezpz
  $ python3 -m pip install -e ezpz
  ```
- [optional] If using `PBS` or `slurm`:

  - Save job info:

    ```bash
    $ source ezpz/src/ezpz/bin/savejobenv
    ┌──────────────────────────────────────────────────────────────────
    │ [savejobenv]:
    │     • Writing PBS vars to: /home/foremans/.pbsenv
    └──────────────────────────────────────────────────────────────────
    ┌──────────────────────────────────────────────────────────────────
    │ [HOSTS]:
    │     • [host:0] - x1921c0s2b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:1] - x1921c0s7b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:2] - x1921c4s1b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:3] - x1921c4s5b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:4] - x1921c4s6b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:5] - x1921c4s7b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:6] - x1921c5s0b0n0.hostmgmt2000.cm.americas.sgi.com
    │     • [host:7] - x1921c5s1b0n0.hostmgmt2000.cm.americas.sgi.com
    └──────────────────────────────────────────────────────────────────
    ┌──────────────────────────────────────────────────────────────────
    │ [DIST INFO]:
    │     • HOSTFILE=/var/spool/pbs/aux/9003148.amn-0001
    │     • NHOSTS=8
    │     • NGPU_PER_HOST=12
    │     • NGPUS=96
    └──────────────────────────────────────────────────────────────────
    ┌──────────────────────────────────────────────────────────────────
    │ [LAUNCH]:
    │     • To launch across all available GPUs, use:
    │       'launch' ( = mpiexec --verbose --envall -n 96 -ppn 12 --hostfile /var/spool/pbs/aux/9003148.amn-0001 )
    └──────────────────────────────────────────────────────────────────
    ```
- Launch `test_dist.py`:

  - DDP:

    ```bash
    $ launch python3 -m ezpz.test_dist
    ```

  - DeepSpeed:

    ```bash
    $ BACKEND=deepspeed launch python3 -m ezpz.test_dist --deepspeed --deepspeed_config ezpz/src/ezpz/conf/ds_config.json
    ```
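The contents of `ds_config.json` aren't reproduced here; purely as an illustration, a minimal DeepSpeed config might look like the following sketch (the keys and values below are assumptions, not the actual file):

```python
# Illustrative-only DeepSpeed config, written as a Python dict:
ds_config = {
    "train_micro_batch_size_per_gpu": 64,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}
# deepspeed.initialize() also accepts a dict directly via its `config`
# argument, as an alternative to the --deepspeed_config CLI flag:
# model, optimizer, *_ = deepspeed.initialize(model=model, config=ds_config)
```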
- Output:

  - **GPU**:

    ```bash
    $ launch python3 -m ezpz.test_dist |& tee ezpz-test-dist.log
    Connected to tcp://x3005c0s13b0n0.hsn.cm.polaris.alcf.anl.gov:7919
    Found executable /lus/eagle/projects/datascience/foremans/miniconda3/envs/2024-04-20/bin/python3
    Launching application 9e4c8311-1729-4385-b1d2-d4cd6006ac1d
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=1/7][local_rank=1/3][node=1/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=5/7][local_rank=1/3][node=1/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=3/7][local_rank=3/3][node=1/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=7/7][local_rank=3/3][node=1/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=4/7][local_rank=0/3][node=0/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=6/7][local_rank=2/3][node=0/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=2/7][local_rank=2/3][node=0/1]
    [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=0/7][local_rank=0/3][node=0/1]
    [2024-04-20 19:26:22][WARNING][dist:296] - Using [8 / 8] available "cuda" devices !!
    [2024-04-20 19:26:22][INFO][test_dist:46] - DIST_INIT={'world_size': 8, 'rank': 0, 'local_rank': 0}
    [2024-04-20 19:26:24][INFO][test_dist:84] - model=Network(
      (layers): Sequential(
        (0): Linear(in_features=128, out_features=1024, bias=True)
        (1): Linear(in_features=1024, out_features=512, bias=True)
        (2): Linear(in_features=512, out_features=256, bias=True)
        (3): Linear(in_features=256, out_features=128, bias=True)
        (4): Linear(in_features=128, out_features=128, bias=True)
      )
    )
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=0, loss=2789.99072, dt=0.664, dtf=0.659, dtb=0.005
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=1, loss=1961.33459, dt=0.002, dtf=0.001, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=2, loss=1450.47461, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=3, loss=1088.81958, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=4, loss=945.28839, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=5, loss=906.78857, dt=0.002, dtf=0.000, dtb=0.001
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=6, loss=789.18243, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=7, loss=751.63477, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=8, loss=735.62915, dt=0.002, dtf=0.000, dtb=0.002
    [2024-04-20 19:26:28][INFO][test_dist:126] - iter=9, loss=732.12775, dt=0.002, dtf=0.000, dtb=0.001
    ```
  - **XPU**:

    ```bash
    # [04:50:57 PM] [foremans@x1921c0s0b0n0] ~/q/llm.devkit/Megatron-DeepSpeed/dep/ezpz/s/ezpz main q4-drop 32s
    $ launch python3 -Wignore test_dist.py
    Connected to tcp://x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com:7919
    Found executable /home/foremans/miniconda3/envs/q4-drop/bin/python3
    Launching application 5bf3e9e8-89fb-412a-a49e-3c81601436b7
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=9/23][local_rank=9/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=14/23][local_rank=2/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=3/23][local_rank=3/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=17/23][local_rank=5/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=6/23][local_rank=6/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=13/23][local_rank=1/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=7/23][local_rank=7/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=19/23][local_rank=7/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=8/23][local_rank=8/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=21/23][local_rank=9/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=10/23][local_rank=10/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=22/23][local_rank=10/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=11/23][local_rank=11/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=23/23][local_rank=11/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=2/23][local_rank=2/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=20/23][local_rank=8/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=4/23][local_rank=4/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=15/23][local_rank=3/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=18/23][local_rank=6/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=12/23][local_rank=0/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=1/23][local_rank=1/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=16/23][local_rank=4/11][node=0/1]
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=5/23][local_rank=5/11][node=1/1]
    [2024-04-19 16:51:06][INFO][dist:239] - DistInfo={
        "DEVICE": "xpu",
        "DEVICE_ID": "xpu:0",
        "DISTRIBUTED_BACKEND": "ccl",
        "GPUS_PER_NODE": 12,
        "HOSTFILE": "/var/spool/pbs/aux/8992337.amn-0001",
        "HOSTNAME": "x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com",
        "HOSTS": "['x1921c0s0b0n0', 'x1921c0s5b0n0']",
        "LOCAL_RANK": 0,
        "MACHINE": "SunSpot",
        "NGPUS": 24,
        "NODE_ID": 0,
        "NUM_NODES": 2,
        "RANK": 0,
        "SCHEDULER": "PBS",
        "WORLD_SIZE_IN_USE": 24,
        "WORLD_SIZE_TOTAL": 24
    }
    [2024-04-19 16:51:06][INFO][dist:602] - Using oneccl_bindings from: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/torch-ccl/oneccl_bindings_for_pytorch/__init__.py
    [2024-04-19 16:51:06][INFO][dist:604] - Using ipex from: /home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/intel_extension_for_pytorch/__init__.py
    [2024-04-19 16:51:06][INFO][dist:605] - [0/24] Using device='xpu' with backend='DDP' + 'ccl' for distributed training.
    [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=0/23][local_rank=0/11][node=0/1]
    [2024-04-19 16:51:06][WARNING][dist:296] - Using [24 / 24] available "xpu" devices !!
    2024:04:19-16:51:06:(16909) |CCL_WARN| MPI was initialized externally, CCL-MPI specific environment is ignored
    [2024-04-19 16:51:06][INFO][test_dist:71] - model=Network(
      (layers): Sequential(
        (0): Linear(in_features=128, out_features=1024, bias=True)
        (1): Linear(in_features=1024, out_features=512, bias=True)
        (2): Linear(in_features=512, out_features=256, bias=True)
        (3): Linear(in_features=256, out_features=128, bias=True)
        (4): Linear(in_features=128, out_features=128, bias=True)
      )
    )
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=0, loss=2709.53418, dt=1.380, dtf=0.950, dtb=0.430
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=1, loss=2058.49805, dt=0.133, dtf=0.002, dtb=0.131
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=2, loss=1507.91187, dt=0.004, dtf=0.001, dtb=0.004
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=3, loss=1181.78577, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=4, loss=949.43561, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=5, loss=848.14905, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=6, loss=788.76123, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=7, loss=753.59509, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=8, loss=750.62225, dt=0.004, dtf=0.001, dtb=0.003
    [2024-04-19 16:51:18][INFO][test_dist:101] - iter=9, loss=740.23474, dt=0.004, dtf=0.001, dtb=0.003
    Application 5bf3e9e8 resources: utime=621s stime=111s maxrss=1746816KB inblock=192 oublock=16 minflt=10719359 majflt=7493 nvcsw=169332 nivcsw=77546
    ```
  - **CPU**:

    ```bash
    $ TORCH_DEVICE=cpu mpirun -np 12 python3 test_dist.py
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=1/11][local_rank=1/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=3/11][local_rank=3/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=6/11][local_rank=6/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=5/11][local_rank=5/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=2/11][local_rank=2/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=10/11][local_rank=10/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=4/11][local_rank=4/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=7/11][local_rank=7/11][node=0/0]
    [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=9/11][local_rank=9/11][node=0/0]
    [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=11/11][local_rank=11/11][node=0/0]
    [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=8/11][local_rank=8/11][node=0/0]
    [2024-04-19 14:44:13][INFO][dist:239] - DistInfo={
        "DEVICE": "cpu",
        "DEVICE_ID": "cpu:0",
        "DISTRIBUTED_BACKEND": "gloo",
        "GPUS_PER_NODE": 12,
        "HOSTFILE": "/Users/samforeman/projects/saforem2/ezpz/src/ezpz/hostfile",
        "HOSTNAME": "Sams-MacBook-Pro.local",
        "HOSTS": "['Sams-MacBook-Pro']",
        "LOCAL_RANK": 0,
        "MACHINE": "Sams-MacBook-Pro.local",
        "NGPUS": 12,
        "NODE_ID": 0,
        "NUM_NODES": 1,
        "RANK": 0,
        "SCHEDULER": "LOCAL",
        "WORLD_SIZE_IN_USE": 12,
        "WORLD_SIZE_TOTAL": 12
    }
    [2024-04-19 14:44:13][INFO][dist:605] - [0/12] Using device='cpu' with backend='DDP' + 'gloo' for distributed training.
    [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=0/11][local_rank=0/11][node=0/0]
    [2024-04-19 14:44:13][WARNING][dist:296] - Using [12 / 12] available "cpu" devices !!
    [2024-04-19 14:44:13][INFO][test_dist:72] - model=Network(
      (layers): Sequential(
        (0): Linear(in_features=128, out_features=1024, bias=True)
        (1): Linear(in_features=1024, out_features=512, bias=True)
        (2): Linear(in_features=512, out_features=256, bias=True)
        (3): Linear(in_features=256, out_features=128, bias=True)
        (4): Linear(in_features=128, out_features=128, bias=True)
      )
    )
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=0, loss=2801.62549, dt=0.389, dtf=0.042, dtb=0.348
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=1, loss=2092.84692, dt=0.051, dtf=0.010, dtb=0.041
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=2, loss=1482.45520, dt=0.037, dtf=0.004, dtb=0.033
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=3, loss=1174.38037, dt=0.033, dtf=0.002, dtb=0.031
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=4, loss=938.39917, dt=0.032, dtf=0.003, dtb=0.030
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=5, loss=888.37390, dt=0.035, dtf=0.001, dtb=0.033
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=6, loss=784.63470, dt=0.036, dtf=0.003, dtb=0.032
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=7, loss=749.53839, dt=0.033, dtf=0.002, dtb=0.031
    [2024-04-19 14:44:14][INFO][test_dist:102] - iter=8, loss=732.22656, dt=0.036, dtf=0.003, dtb=0.034
    [2024-04-19 14:44:15][INFO][test_dist:102] - iter=9, loss=730.63776, dt=0.034, dtf=0.001, dtb=0.033
    35.68s user 17.20s system 546% cpu 9.681s total
    ```
## 🧰 Helper Utilities

We provide some shell scripts that are useful when working with a job scheduler (e.g. `PBS Pro` @ ALCF, or `slurm` elsewhere).
- [`savejobenv`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/savejobenv): Shell script to save relevant job-related environment variables to a file, which can then be `source`d from new login instances.

  - Launch a job, clone (or navigate into) `ezpz`, and `source src/ezpz/bin/savejobenv`:

    ```bash
    (thetalogin4) $ qsub-gpu -A datascience -n 2 -q full-node --attrs="filesystems=home,grand,eagle,theta-fs0:ssds=required" -t 06:00 -I
    Job routed to queue "full-node".
    Wait for job 10155652 to start...
    Opening interactive session to thetagpu04
    [...]
    ```

    ```bash
    (thetagpu04) $ git clone https://github.com/saforem2/ezpz
    (thetagpu04) $ source ezpz/src/ezpz/bin/savejobenv
    ┌──────────────────────────────────────────────────────────────────
    │ Writing COBALT vars to /home/foremans/.cobaltenv
    │ HOSTFILE: /var/tmp/cobalt.10155652
    │ NHOSTS: 2
    │ 8 GPUs per host
    │ 16 GPUs total
    └──────────────────────────────────────────────────────────────────
    ┌──────────────────────────────────────────────────────────────────
    │ [DIST INFO]:
    │   • Writing Job info to /home/foremans/.cobaltenv
    │   • HOSTFILE: /var/tmp/cobalt.10155652
    │   • NHOSTS: 2
    │   • NGPU_PER_HOST: 8
    │   • NGPUS = (NHOSTS * NGPU_PER_HOST) = 16
    │ [Hosts]:
    │   • thetagpu04 thetagpu19
    │ [Launch]:
    │   • Use: 'launch' (=mpirun -n  -N  --hostfile /var/tmp/cobalt.10155652 -x PATH -x LD_LIBRARY_PATH)
    │     to launch job
    └──────────────────────────────────────────────────────────────────
    ┌───────────────────────────────────────────────────────────────────────────────
    │ YOU ARE HERE: /home/foremans
    │ Run 'source ./bin/getjobenv' in a NEW SHELL to automatically set env vars
    └───────────────────────────────────────────────────────────────────────────────
    ```
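Since the saved file is meant to be `source`d, it presumably contains shell-style variable assignments; a hypothetical Python helper (not part of `ezpz`) for reading it back, assuming `export KEY=VALUE` lines:

```python
import os


def load_job_env(path: str = os.path.expanduser('~/.cobaltenv')) -> None:
    """Hypothetical helper: read assumed `export KEY=VALUE` lines
    written by savejobenv into os.environ for a Python process."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line.startswith('export ') and '=' in line:
                key, _, val = line.removeprefix('export ').partition('=')
                os.environ[key] = val.strip('"\'')


load_job_env()
print(os.environ.get('HOSTFILE'))  # e.g. /var/tmp/cobalt.10155652
```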
- [`getjobenv`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/getjobenv): Shell script that, when sourced, will populate the current environment with the necessary job-related variables.

  - Now, in a NEW SHELL:

    ```bash
    (localhost) $ ssh <user>@theta
    ```

    ```bash
    (thetalogin4) $ ssh thetagpu19
    ```

    ```bash
    (thetagpu19) $ module load conda/2023-01-11; conda activate base
    (thetagpu19) $ cd ezpz
    (thetagpu19) $ source ./src/ezpz/bin/getjobenv
    ┌──────────────────────────────────────────────────────────────────
    │ [Hosts]:
    │   • thetagpu04, thetagpu19
    └──────────────────────────────────────────────────────────────────
    ┌──────────────────────────────────────────────────────────────────
    │ [DIST INFO]:
    │   • Loading job env from: /home/foremans/.cobaltenv
    │   • HOSTFILE: /var/tmp/cobalt.10155652
    │   • NHOSTS: 2
    │   • NGPU_PER_HOST: 8
    │   • NGPUS (NHOSTS x NGPU_PER_HOST): 16
    │   • DIST_LAUNCH: mpirun -n 16 -N 8 --hostfile /var/tmp/cobalt.10155652 -x PATH -x LD_LIBRARY_PATH
    │   • Defining alias: launch: aliased to mpirun -n 16 -N 8 --hostfile /var/tmp/cobalt.10155652 -x PATH -x LD_LIBRARY_PATH
    └──────────────────────────────────────────────────────────────────
    (thetagpu19) $ mkdir -p venvs/thetaGPU/2023-01-11
    (thetagpu19) $ python3 -m venv venvs/thetaGPU/2023-01-11 --system-site-packages
    (thetagpu19) $ source venvs/thetaGPU/2023-01-11/bin/activate
    (thetagpu19) $ python3 -m pip install -e . --require-virtualenv
    (thetagpu19) $ launch python3 -m ezpz framework=pytorch backend=DDP
    [2023-10-26 12:21:26,716][ezpz.dist][INFO] - Using DDP for distributed training
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 13
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 14
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 8
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 12
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 6
    [2023-10-26 12:21:26,788][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 9
    [2023-10-26 12:21:26,787][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 10
    [2023-10-26 12:21:26,788][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 15
    [2023-10-26 12:21:26,788][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 11
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 7
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 3
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 4
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 5
    [2023-10-26 12:21:26,789][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 2
    [2023-10-26 12:21:26,798][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
    [2023-10-26 12:21:26,811][torch.distributed.distributed_c10d][INFO] - Rank 14: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,812][torch.distributed.distributed_c10d][INFO] - Rank 6: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,814][torch.distributed.distributed_c10d][INFO] - Rank 13: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,815][torch.distributed.distributed_c10d][INFO] - Rank 7: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,816][torch.distributed.distributed_c10d][INFO] - Rank 8: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,817][torch.distributed.distributed_c10d][INFO] - Rank 3: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,819][torch.distributed.distributed_c10d][INFO] - Rank 12: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,820][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,821][torch.distributed.distributed_c10d][INFO] - Rank 10: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,823][torch.distributed.distributed_c10d][INFO] - Rank 4: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,825][torch.distributed.distributed_c10d][INFO] - Rank 9: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,825][torch.distributed.distributed_c10d][INFO] - Rank 5: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,827][torch.distributed.distributed_c10d][INFO] - Rank 15: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,828][torch.distributed.distributed_c10d][INFO] - Rank 2: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,830][torch.distributed.distributed_c10d][INFO] - Rank 11: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:26,831][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 16 nodes.
    [2023-10-26 12:21:27,035][ezpz.dist][INFO] - RANK: 0 / 15
    {
      "framework": "pytorch",
      "backend": "DDP",
      "use_wandb": false,
      "seed": null,
      "port": null,
      "ds_config_path": null,
      "wandb_project_name": null,
      "precision": null,
      "ngpus": null
    }
    [2023-10-26 12:21:27,038][__main__][INFO] - Output dir: /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/ezpz/outputs/runs/pytorch/DDP/2023-10-26/12-21-25
    [2023-10-26 12:21:27,097][ezpz.dist][INFO] - RANK: 8 / 15
    [2023-10-26 12:21:27,103][ezpz.dist][INFO] - RANK: 6 / 15
    [2023-10-26 12:21:27,104][ezpz.dist][INFO] - RANK: 14 / 15
    [2023-10-26 12:21:27,111][ezpz.dist][INFO] - RANK: 13 / 15
    [2023-10-26 12:21:27,116][ezpz.dist][INFO] - RANK: 1 / 15
    [2023-10-26 12:21:27,126][ezpz.dist][INFO] - RANK: 7 / 15
    [2023-10-26 12:21:27,135][ezpz.dist][INFO] - RANK: 10 / 15
    [2023-10-26 12:21:27,139][ezpz.dist][INFO] - RANK: 12 / 15
    [2023-10-26 12:21:27,141][ezpz.dist][INFO] - RANK: 9 / 15
    [2023-10-26 12:21:27,141][ezpz.dist][INFO] - RANK: 15 / 15
    [2023-10-26 12:21:27,141][ezpz.dist][INFO] - RANK: 11 / 15
    [2023-10-26 12:21:27,141][ezpz.dist][INFO] - RANK: 5 / 15
    [2023-10-26 12:21:27,144][ezpz.dist][INFO] - RANK: 2 / 15
    [2023-10-26 12:21:27,145][ezpz.dist][INFO] - RANK: 4 / 15
    [2023-10-26 12:21:27,145][ezpz.dist][INFO] - RANK: 3 / 15
    16.56s user 30.05s system 706% cpu 6.595s total
    ```
While this example used ThetaGPU, the exact same process will work on any of {`ThetaGPU`, `Polaris`, `Perlmutter`}.
## ❤️‍🩹 Status

> Last Updated: 05/13/2024 @ 22:04:56