I am trying to run a simple training script using HF's transformers library and am running into the error `Distributed package doesn't have NCCL built in`.
Runtime: DBR 13.0 ML - Spark 3.4.0 - Scala 2.12
Driver: i3.xlarge - 4 cores
Note: This is a CPU instance
I am trying to fine-tune a transformers model for Sequence Classification - essentially following this tutorial: https://huggingface.co/docs/transformers/training.
When I try to initialize TrainingArguments (`TrainingArguments(output_dir="test_trainer")`), I get the following error trace:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File <command-1074749622305054>:3
1 from transformers import TrainingArguments
----> 3 TrainingArguments(output_dir="test_trainer")
File <string>:108, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_on_each_node, no_cuda, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, xpu_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, sharded_ddp, fsdp, fsdp_min_num_params, fsdp_transformer_layer_cls_to_wrap, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, dataloader_pin_memory, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, gradient_checkpointing, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode)
File /databricks/python/lib/python3.10/site-packages/transformers/training_args.py:1172, in TrainingArguments.__post_init__(self)
1162 warnings.warn(
1163 "`--adafactor` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--optim"
1164 " adafactor` instead",
1165 FutureWarning,
1166 )
1167 self.optim = OptimizerNames.ADAFACTOR
1169 if (
1170 self.framework == "pt"
1171 and is_torch_available()
-> 1172 and (self.device.type != "cuda")
1173 and (get_xla_device_type(self.device) != "GPU")
1174 and (self.fp16 or self.fp16_full_eval)
1175 ):
1176 raise ValueError(
1177 "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
1178 " (`--fp16_full_eval`) can only be used on CUDA devices."
1179 )
1181 if (
1182 self.framework == "pt"
1183 and is_torch_available()
(...)
1188 and (self.bf16 or self.bf16_full_eval)
1189 ):
File /databricks/python/lib/python3.10/site-packages/transformers/training_args.py:1556, in TrainingArguments.device(self)
1552 """
1553 The device used by this process.
1554 """
1555 requires_backends(self, ["torch"])
-> 1556 return self._setup_devices
File /databricks/python/lib/python3.10/site-packages/transformers/utils/generic.py:57, in cached_property.__get__(self, obj, objtype)
55 cached = getattr(obj, attr, None)
56 if cached is None:
---> 57 cached = self.fget(obj)
58 setattr(obj, attr, cached)
59 return cached
File /databricks/python/lib/python3.10/site-packages/transformers/training_args.py:1541, in TrainingArguments._setup_devices(self)
1537 else:
1538 # Here, we'll use torch.distributed.
1539 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
1540 if not torch.distributed.is_initialized():
-> 1541 torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
1542 device = torch.device("cuda", self.local_rank)
1543 self._n_gpu = 1
File /databricks/python/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:761, in init_process_group(backend, init_method, timeout, world_size, rank, store, group_name, pg_options)
757 # Use a PrefixStore to avoid accidental overrides of keys used by
758 # different systems (e.g. RPC) in case the store is multi-tenant.
759 store = PrefixStore("default_pg", store)
--> 761 default_pg = _new_process_group_helper(
762 world_size,
763 rank,
764 [],
765 backend,
766 store,
767 pg_options=pg_options,
768 group_name=group_name,
769 timeout=timeout,
770 )
771 _update_default_pg(default_pg)
773 _pg_group_ranks[GroupMember.WORLD] = {i: i for i in range(GroupMember.WORLD.size())} # type: ignore[attr-defined, index]
File /databricks/python/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:886, in _new_process_group_helper(group_size, group_rank, global_ranks_in_group, backend, store, pg_options, group_name, timeout)
884 elif backend == Backend.NCCL:
885 if not is_nccl_available():
--> 886 raise RuntimeError("Distributed package doesn't have NCCL " "built in")
887 if pg_options is not None:
888 assert isinstance(
889 pg_options, ProcessGroupNCCL.Options
890 ), "Expected pg_options argument to be of type ProcessGroupNCCL.Options"
RuntimeError: Distributed package doesn't have NCCL built in
---
I have tried the following fix with no effect.
```
import os
os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"
```
I cannot find any other pointers.
Can anyone please give suggestions on what may be going on?