πŸ‹ ezpz

Sam Foreman

Launch, train and communicate across all your accelerators, ezpz πŸ‹.

πŸ“¦ Install #

python3 -m pip install -e "git+https://github.com/saforem2/ezpz#egg=ezpz" --require-virtualenv
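
Note that `--require-virtualenv` makes pip refuse to install outside of a virtual environment, so create and activate one first if you haven't. A quick end-to-end check:

```bash
python3 -m venv venv && source venv/bin/activate
python3 -m pip install -e "git+https://github.com/saforem2/ezpz#egg=ezpz" --require-virtualenv
python3 -c 'import ezpz; print(ezpz.__file__)'
```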

🧩 Features #

ezpz simplifies the process of launching, training, and communicating across all of your accelerators: a single setup call initializes distributed training for your choice of {DDP, deepspeed, horovod} backend, on whatever devices are available (cuda, xpu, or cpu), under whatever scheduler happens to be running ({PBS, slurm}, or none at all).
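
In practice, this means the distributed boilerplate at the top of a training script collapses to a few lines; the snippet below is distilled from the test_dist.py example that follows:

```python
import os

import ezpz as ez

# one call: initialize torch.distributed for the chosen backend
# and discover whatever device this rank should be using
RANK = ez.setup_torch(backend=os.environ.get("BACKEND", "DDP"))
DEVICE = ez.get_torch_device()  # e.g. "cuda", "xpu", or "cpu"
WORLD_SIZE = ez.get_world_size()
LOCAL_RANK = ez.get_local_rank()
```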

πŸš€ Launch #

[!IMPORTANT] We walk through a complete example below that will:

  1. Install ezpz
  2. Launch test_dist.py across all the GPUs in your active {PBS, slurm} job
test_dist.py
    import os
    import logging
    import time
    from typing import Optional

    import torch

    import ezpz as ez

    # backend can be any of DDP, deepspeed, horovod
    RANK = ez.setup_torch(
        backend=(
            backend := os.environ.get('BACKEND', 'DDP')
        ),
        port=(
            port := os.environ.get("MASTER_PORT", "29500")
        )
    )
    DEVICE = ez.get_torch_device()
    WORLD_SIZE = ez.get_world_size()
    LOCAL_RANK = ez.get_local_rank()
    DEVICE_ID = f"{DEVICE}:{LOCAL_RANK}"


    # log only from RANK == 0
    logger = logging.getLogger(__name__)
    logger.setLevel("INFO") if RANK == 0 else logger.setLevel("CRITICAL")

    BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 64))
    INPUT_SIZE = int(os.environ.get("INPUT_SIZE", 128))
    OUTPUT_SIZE = int(os.environ.get("OUTPUT_SIZE", 128))
    # resolve e.g. DTYPE=float16 -> torch.float16; fall back to the default dtype
    DTYPE = (
        getattr(torch, dtype_str)
        if (dtype_str := os.environ.get("DTYPE")) is not None
        else torch.get_default_dtype()
    )
    TRAIN_ITERS = int(os.environ.get("TRAIN_ITERS", 50))


    class Network(torch.nn.Module):
        def __init__(
                self,
                input_dim: int = 128,
                output_dim: int = 128,
                sizes: Optional[list[int]] = None,
        ):
            super().__init__()
            if sizes is None or len(sizes) == 0:
                # no hidden layers: a single affine map
                self.layers = torch.nn.Linear(input_dim, output_dim)
            else:
                layers = [torch.nn.Linear(input_dim, sizes[0])]
                for idx, size in enumerate(sizes[1:]):
                    layers.append(torch.nn.Linear(sizes[idx], size))
                layers.append(torch.nn.Linear(sizes[-1], output_dim))
                self.layers = torch.nn.Sequential(*layers)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.layers(x)


    def calc_loss(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return (y - x).pow(2).sum()


    def plot_losses(losses: dict) -> None:
        import plotext as pltx
        pltx.theme('clear')
        pltx.scatter(list(losses.values()))
        # label the axes *before* rendering so they show up in the plot
        pltx.ylabel("loss")
        pltx.xlabel("iteration")
        pltx.show()
        pltx.save_fig("test_dist_losses.txt")


    def main():
        model = Network(
            input_dim=INPUT_SIZE,
            output_dim=OUTPUT_SIZE,
            sizes=[1024, 512, 256, 128]
        )
        model.to(DEVICE_ID)
        logger.info(f'{model=}')
        optimizer = torch.optim.Adam(model.parameters())
        if backend.lower() == 'ddp':
            if WORLD_SIZE > 1:
                from torch.nn.parallel import DistributedDataParallel as DDP
                model = DDP(
                    model,
                    # pin each rank to its own accelerator (None for cpu / xpu)
                    device_ids=(
                        [LOCAL_RANK] if torch.cuda.is_available() else None
                    ),
                )
        elif backend.lower() in ('ds', 'deepspeed'):
            import argparse
            import deepspeed
            parser = argparse.ArgumentParser(
                description='My training script.'
            )
            parser.add_argument(
                '--local_rank',
                required=False,
                type=int,
                default=-1,
                help='local rank passed from distributed launcher',
            )
            # Include DeepSpeed configuration arguments
            parser = deepspeed.add_config_arguments(parser)
            cmd_args = parser.parse_args()
            logger.info(f'{cmd_args=}')
            model, optimizer, *_ = deepspeed.initialize(
                args=cmd_args,
                model=model,
                optimizer=optimizer,
            )

        losses = {}
        for iter in range(TRAIN_ITERS):
            t0 = time.perf_counter()
            x = torch.rand((BATCH_SIZE, INPUT_SIZE), dtype=DTYPE).to(DEVICE_ID)
            y = model(x)
            loss = calc_loss(x, y)
            losses[iter] = loss.item()  # store a float, not the graph-attached tensor
            dtf = ((t1 := time.perf_counter()) - t0)  # forward (+ loss) time
            if backend.lower() in ('ds', 'deepspeed'):
                # the DeepSpeed engine owns the backward pass,
                # optimizer step, and gradient zeroing
                model.backward(loss)
                model.step()
            else:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            dtb = time.perf_counter() - t1  # backward + step time
            logger.info(
                ', '.join([
                    f'{iter=}',
                    f'loss={loss.item():.5f}',
                    f'dt={dtf+dtb:.3f}',
                    f'{dtf=:.3f}',
                    f'{dtb=:.3f}'
                ])
            )
        if RANK == 0:
            plot_losses(losses)


    if __name__ == '__main__':
        main()
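
Every knob in test_dist.py (`BACKEND`, `MASTER_PORT`, `BATCH_SIZE`, `INPUT_SIZE`, `OUTPUT_SIZE`, `DTYPE`, `TRAIN_ITERS`) is read from the environment, so quick experiments need no code changes, e.g. (using the `launch` alias described below):

```bash
TRAIN_ITERS=100 BATCH_SIZE=128 launch python3 -m ezpz.test_dist
```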

πŸƒπŸ»β€β™‚οΈRunning #

  1. Install:

    $ git clone https://github.com/saforem2/ezpz
    $ python3 -m pip install -e ezpz
    
  2. [optional] If using PBS or slurm:

    • Save Job info:
      • savejobenv:

          $ source ezpz/src/ezpz/bin/savejobenv
          β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
          β”‚ [savejobenv]:
          β”‚     β€’ Writing PBS vars to: /home/foremans/.pbsenv
          β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
          β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
          β”‚ [HOSTS]:
          β”‚     β€’ [host:0] - x1921c0s2b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:1] - x1921c0s7b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:2] - x1921c4s1b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:3] - x1921c4s5b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:4] - x1921c4s6b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:5] - x1921c4s7b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:6] - x1921c5s0b0n0.hostmgmt2000.cm.americas.sgi.com
          β”‚     β€’ [host:7] - x1921c5s1b0n0.hostmgmt2000.cm.americas.sgi.com
          β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
          β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
          β”‚ [DIST INFO]:
          β”‚     β€’ HOSTFILE=/var/spool/pbs/aux/9003148.amn-0001
          β”‚     β€’ NHOSTS=8
          β”‚     β€’ NGPU_PER_HOST=12
          β”‚     β€’ NGPUS=96
          β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
          β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
          β”‚ [LAUNCH]:
          β”‚     β€’ To launch across all available GPUs, use:
          β”‚       'launch' ( = mpiexec --verbose --envall -n 96 -ppn 12 --hostfile /var/spool/pbs/aux/9003148.amn-0001 )
          β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
        
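        As the [LAUNCH] box above shows, savejobenv also defines a `launch` alias wrapping the scheduler-appropriate launcher; for this particular 8-node, 96-GPU PBS job it is equivalent to running:

        ```bash
        mpiexec --verbose --envall -n 96 -ppn 12 \
            --hostfile /var/spool/pbs/aux/9003148.amn-0001 \
            python3 -m ezpz.test_dist
        ```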
  3. Launch test_dist.py

    • DDP:

      $ launch python3 -m ezpz.test_dist
      
    • DeepSpeed:

      $ BACKEND=deepspeed launch python3 -m ezpz.test_dist --deepspeed --deepspeed_config ezpz/src/ezpz/conf/ds_config.json
      
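      The contents of the shipped ezpz/src/ezpz/conf/ds_config.json are not reproduced here; since test_dist.py already passes its own model and optimizer to deepspeed.initialize, a minimal config needs little beyond the micro-batch size (cf. the train_micro_batch_size_per_gpu hint in test_dist.py). A hypothetical stand-in:

      ```bash
      # hypothetical minimal DeepSpeed config; the shipped ds_config.json may differ
      cat > my_ds_config.json <<'EOF'
      {
        "train_micro_batch_size_per_gpu": 64
      }
      EOF
      BACKEND=deepspeed launch python3 -m ezpz.test_dist --deepspeed --deepspeed_config my_ds_config.json
      ```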
    • Output:

      • GPU
        ```bash
        $ launch python3 -m ezpz.test_dist |& tee ezpz-test-dist.log
        
        Connected to tcp://x3005c0s13b0n0.hsn.cm.polaris.alcf.anl.gov:7919
        Found executable /lus/eagle/projects/datascience/foremans/miniconda3/envs/2024-04-20/bin/python3
        Launching application 9e4c8311-1729-4385-b1d2-d4cd6006ac1d
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=1/7][local_rank=1/3][node=1/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=5/7][local_rank=1/3][node=1/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=3/7][local_rank=3/3][node=1/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=7/7][local_rank=3/3][node=1/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=4/7][local_rank=0/3][node=0/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=6/7][local_rank=2/3][node=0/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=2/7][local_rank=2/3][node=0/1]
        [2024-04-20 19:26:22][INFO][dist:290] - [device='cuda'][rank=0/7][local_rank=0/3][node=0/1]
        [2024-04-20 19:26:22][WARNING][dist:296] - Using [8 / 8] available "cuda" devices !!
        [2024-04-20 19:26:22][INFO][test_dist:46] - DIST_INIT={'world_size': 8, 'rank': 0, 'local_rank': 0}
        [2024-04-20 19:26:24][INFO][test_dist:84] - model=Network(
          (layers): Sequential(
            (0): Linear(in_features=128, out_features=1024, bias=True)
            (1): Linear(in_features=1024, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=256, bias=True)
            (3): Linear(in_features=256, out_features=128, bias=True)
            (4): Linear(in_features=128, out_features=128, bias=True)
          )
        )
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=0, loss=2789.99072, dt=0.664, dtf=0.659, dtb=0.005
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=1, loss=1961.33459, dt=0.002, dtf=0.001, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=2, loss=1450.47461, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=3, loss=1088.81958, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=4, loss=945.28839, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=5, loss=906.78857, dt=0.002, dtf=0.000, dtb=0.001
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=6, loss=789.18243, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=7, loss=751.63477, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=8, loss=735.62915, dt=0.002, dtf=0.000, dtb=0.002
        [2024-04-20 19:26:28][INFO][test_dist:126] - iter=9, loss=732.12775, dt=0.002, dtf=0.000, dtb=0.001
        ```
        
      • XPU
          # [04:50:57 PM] [foremans@x1921c0s0b0n0] ~/q/llm.devkit/Megatron-DeepSpeed/dep/ezpz/s/ezpz  main q4-drop 32s
          $ launch python3 -Wignore test_dist.py
          Connected to tcp://x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com:7919
          Found executable /home/foremans/miniconda3/envs/q4-drop/bin/python3
          Launching application 5bf3e9e8-89fb-412a-a49e-3c81601436b7
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=9/23][local_rank=9/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=14/23][local_rank=2/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=3/23][local_rank=3/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=17/23][local_rank=5/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=6/23][local_rank=6/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=13/23][local_rank=1/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=7/23][local_rank=7/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=19/23][local_rank=7/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=8/23][local_rank=8/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=21/23][local_rank=9/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=10/23][local_rank=10/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=22/23][local_rank=10/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=11/23][local_rank=11/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=23/23][local_rank=11/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=2/23][local_rank=2/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=20/23][local_rank=8/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=4/23][local_rank=4/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=15/23][local_rank=3/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=18/23][local_rank=6/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=12/23][local_rank=0/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=1/23][local_rank=1/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=16/23][local_rank=4/11][node=0/1]
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=5/23][local_rank=5/11][node=1/1]
          [2024-04-19 16:51:06][INFO][dist:239] - DistInfo={
              "DEVICE": "xpu",
              "DEVICE_ID": "xpu:0",
              "DISTRIBUTED_BACKEND": "ccl",
              "GPUS_PER_NODE": 12,
              "HOSTFILE": "/var/spool/pbs/aux/8992337.amn-0001",
              "HOSTNAME": "x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com",
              "HOSTS": "['x1921c0s0b0n0', 'x1921c0s5b0n0']",
              "LOCAL_RANK": 0,
              "MACHINE": "SunSpot",
              "NGPUS": 24,
              "NODE_ID": 0,
              "NUM_NODES": 2,
              "RANK": 0,
              "SCHEDULER": "PBS",
              "WORLD_SIZE_IN_USE": 24,
              "WORLD_SIZE_TOTAL": 24
          }
          [2024-04-19 16:51:06][INFO][dist:602] - Using oneccl_bindings from: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/torch-ccl/oneccl_bindings_for_pytorch/__init__.py
          [2024-04-19 16:51:06][INFO][dist:604] - Using ipex from: /home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/intel_extension_for_pytorch/__init__.py
          [2024-04-19 16:51:06][INFO][dist:605] - [0/24] Using device='xpu' with backend='DDP' + 'ccl' for distributed training.
          [2024-04-19 16:51:06][INFO][dist:290] - [device='xpu'][rank=0/23][local_rank=0/11][node=0/1]
          [2024-04-19 16:51:06][WARNING][dist:296] - Using [24 / 24] available "xpu" devices !!
          2024:04:19-16:51:06:(16909) |CCL_WARN| MPI was initialized externally, CCL-MPI specific environment is ignored
          [2024-04-19 16:51:06][INFO][test_dist:71] - model=Network(
            (layers): Sequential(
              (0): Linear(in_features=128, out_features=1024, bias=True)
              (1): Linear(in_features=1024, out_features=512, bias=True)
              (2): Linear(in_features=512, out_features=256, bias=True)
              (3): Linear(in_features=256, out_features=128, bias=True)
              (4): Linear(in_features=128, out_features=128, bias=True)
            )
          )
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=0, loss=2709.53418, dt=1.380, dtf=0.950, dtb=0.430
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=1, loss=2058.49805, dt=0.133, dtf=0.002, dtb=0.131
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=2, loss=1507.91187, dt=0.004, dtf=0.001, dtb=0.004
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=3, loss=1181.78577, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=4, loss=949.43561, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=5, loss=848.14905, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=6, loss=788.76123, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=7, loss=753.59509, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=8, loss=750.62225, dt=0.004, dtf=0.001, dtb=0.003
          [2024-04-19 16:51:18][INFO][test_dist:101] - iter=9, loss=740.23474, dt=0.004, dtf=0.001, dtb=0.003
          Application 5bf3e9e8 resources: utime=621s stime=111s maxrss=1746816KB inblock=192 oublock=16 minflt=10719359 majflt=7493 nvcsw=169332 nivcsw=77546
        
      • CPU
          $ TORCH_DEVICE=cpu mpirun -np 12 python3 test_dist.py
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=1/11][local_rank=1/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=3/11][local_rank=3/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=6/11][local_rank=6/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=5/11][local_rank=5/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=2/11][local_rank=2/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=10/11][local_rank=10/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=4/11][local_rank=4/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=7/11][local_rank=7/11][node=0/0]
          [2024-04-19 14:44:12][INFO][dist:290] - [device='cpu'][rank=9/11][local_rank=9/11][node=0/0]
          [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=11/11][local_rank=11/11][node=0/0]
          [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=8/11][local_rank=8/11][node=0/0]
          [2024-04-19 14:44:13][INFO][dist:239] - DistInfo={
              "DEVICE": "cpu",
              "DEVICE_ID": "cpu:0",
              "DISTRIBUTED_BACKEND": "gloo",
              "GPUS_PER_NODE": 12,
              "HOSTFILE": "/Users/samforeman/projects/saforem2/ezpz/src/ezpz/hostfile",
              "HOSTNAME": "Sams-MacBook-Pro.local",
              "HOSTS": "['Sams-MacBook-Pro']",
              "LOCAL_RANK": 0,
              "MACHINE": "Sams-MacBook-Pro.local",
              "NGPUS": 12,
              "NODE_ID": 0,
              "NUM_NODES": 1,
              "RANK": 0,
              "SCHEDULER": "LOCAL",
              "WORLD_SIZE_IN_USE": 12,
              "WORLD_SIZE_TOTAL": 12
          }
          [2024-04-19 14:44:13][INFO][dist:605] - [0/12] Using device='cpu' with backend='DDP' + 'gloo' for distributed training.
          [2024-04-19 14:44:13][INFO][dist:290] - [device='cpu'][rank=0/11][local_rank=0/11][node=0/0]
          [2024-04-19 14:44:13][WARNING][dist:296] - Using [12 / 12] available "cpu" devices !!
          [2024-04-19 14:44:13][INFO][test_dist:72] - model=Network(
            (layers): Sequential(
              (0): Linear(in_features=128, out_features=1024, bias=True)
              (1): Linear(in_features=1024, out_features=512, bias=True)
              (2): Linear(in_features=512, out_features=256, bias=True)
              (3): Linear(in_features=256, out_features=128, bias=True)
              (4): Linear(in_features=128, out_features=128, bias=True)
            )
          )
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=0, loss=2801.62549, dt=0.389, dtf=0.042, dtb=0.348
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=1, loss=2092.84692, dt=0.051, dtf=0.010, dtb=0.041
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=2, loss=1482.45520, dt=0.037, dtf=0.004, dtb=0.033
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=3, loss=1174.38037, dt=0.033, dtf=0.002, dtb=0.031
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=4, loss=938.39917, dt=0.032, dtf=0.003, dtb=0.030
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=5, loss=888.37390, dt=0.035, dtf=0.001, dtb=0.033
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=6, loss=784.63470, dt=0.036, dtf=0.003, dtb=0.032
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=7, loss=749.53839, dt=0.033, dtf=0.002, dtb=0.031
          [2024-04-19 14:44:14][INFO][test_dist:102] - iter=8, loss=732.22656, dt=0.036, dtf=0.003, dtb=0.034
          [2024-04-19 14:44:15][INFO][test_dist:102] - iter=9, loss=730.63776, dt=0.034, dtf=0.001, dtb=0.033
          35.68s user 17.20s system 546% cpu 9.681s total
        
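The CPU example above also shows that none of this requires a scheduler: overriding `TORCH_DEVICE` and using plain `mpirun` is enough. Going one step further, a single-process smoke test on your local machine should work as well (assuming ezpz falls back gracefully to `WORLD_SIZE=1` when no distributed launcher is present):

```bash
TORCH_DEVICE=cpu python3 -m ezpz.test_dist
```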

🧰 Helper Utilities #

We provide some shell scripts that are useful when working with a job scheduler (e.g. PBS Pro @ ALCF, or slurm elsewhere).
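
For example, savejobenv (shown in action above) snapshots the active job's environment and defines the `launch` alias; a companion getjobenv (an assumption here; check src/ezpz/bin/ in your checkout) restores that snapshot from a fresh login shell:

```bash
# from inside a running {PBS, slurm} job: save job info and define `launch`
source ezpz/src/ezpz/bin/savejobenv
# later, from a new shell attached to the same job: restore what was saved
source ezpz/src/ezpz/bin/getjobenv
```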

β€οΈβ€πŸ©Ή Status

Last Updated: 05/13/2024 @ 22:04:56